From 6fa6cdd2b988da98cbdd2b1a5fd2fd3b9d56a4b1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 4 Jun 2014 09:21:48 -0700 Subject: Added support for tensor contractions Updated expression evaluation mechanism to also compute the size of the tensor result Misc fixes and improvements. --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 229 +++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 000000000..d424df36e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,229 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename internal::promote_storage_type::ret Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorContractionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorContractionOp type; +}; + +} // end namespace internal + + + +template +class TensorContractionOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename internal::promote_storage_type::ret PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} + + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Indices m_indices; +}; + + +template struct max_n_1 { + static 
const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + +template +struct TensorEvaluator > +{ + typedef TensorContractionOp XprType; + + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * Indices::size>::size; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ + false, + }; + + TensorEvaluator(const XprType& op) + : m_leftImpl(op.lhsExpression()), m_rightImpl(op.rhsExpression()) + { + Index index = 0; + Index stride = 1; + m_shiftright = 1; + + int skipped = 0; + const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + bool skip = false; + for (int j = 0; j < Indices::size; ++j) { + if (op.indices()[j].first == i) { + skip = true; + m_leftOffsets[2*skipped] = stride; + m_leftOffsets[2*skipped+1] = stride * left_dims[i]; + m_stitchsize[skipped] = left_dims[i]; + break; + } + } + if (!skip) { + m_dimensions[index++] = left_dims[i]; + m_shiftright *= left_dims[i]; + } else { + ++skipped; + } + stride *= left_dims[i]; + } + + stride = 1; + skipped = 0; + const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + bool skip = false; + for (int j = 0; j < Indices::size; ++j) { + if (op.indices()[j].second == i) { + skip = true; + m_rightOffsets[2*skipped] = stride; + m_rightOffsets[2*skipped+1] = stride * right_dims[i]; + break; + } + } + if (!skip) { + m_dimensions[index++] = right_dims[i]; + } else { + ++skipped; + } + stride *= right_dims[i]; + } + + // Scalar case + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * Indices::size) { + m_dimensions[0] = 1; + } + } + + // typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + const Dimensions& dimensions() const { return m_dimensions; } + + void evalTo(typename XprType::Scalar* buffer) const { + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + const Index startLeft = index % m_shiftright; + const Index startRight = index / m_shiftright; + CoeffReturnType result = CoeffReturnType(0); + partialStitch(startLeft, startRight, 0, result); + return result; + } + + /* TODO: vectorization + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + assert(false); + }*/ + + private: + EIGEN_DEVICE_FUNC void partialStitch(Index startLeft, Index startRight, int StitchIndex, CoeffReturnType& accum) const { + Index firstLeft = (startLeft / m_leftOffsets[2*StitchIndex]) * m_leftOffsets[2*StitchIndex+1] + (startLeft % m_leftOffsets[2*StitchIndex]); + Index firstRight = (startRight / m_rightOffsets[2*StitchIndex]) * m_rightOffsets[2*StitchIndex+1] + (startRight % m_rightOffsets[2*StitchIndex]); + + for (int j = 0; j < m_stitchsize[StitchIndex]; ++j) { + const Index left = firstLeft+j*m_leftOffsets[2*StitchIndex]; + const Index right = firstRight+j*m_rightOffsets[2*StitchIndex]; + if (StitchIndex < Indices::size-1) { + partialStitch(left, right, StitchIndex+1, accum); + } else { + accum += m_leftImpl.coeff(left) * 
m_rightImpl.coeff(right); + } + } + } + + private: + array m_leftOffsets; + array m_rightOffsets; + array m_stitchsize; + Index m_shiftright; + Dimensions m_dimensions; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H -- cgit v1.2.3 From a669052f12d6d71ba815764d6419726d64fef675 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 09:45:30 -0700 Subject: Improved support for rvalues in tensor expressions. --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 58 ++++++++++++++++------ .../Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++ .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 4 ++ unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h | 8 +++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 6 ++- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 6 ++- 7 files changed, 71 insertions(+), 20 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 932e5c82d..e447a5d40 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -22,7 +22,7 @@ namespace Eigen { */ template -class TensorBase +class TensorBase { public: typedef typename internal::traits::Scalar Scalar; @@ -30,19 +30,6 @@ class TensorBase typedef Scalar CoeffReturnType; typedef typename internal::packet_traits::type PacketReturnType; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setZero() { - return setConstant(Scalar(0)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { - return derived() = constant(val); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setRandom() { - return derived() = random(); - } - // Nullary operators EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> @@ -224,14 +211,53 @@ class TensorBase return TensorReshapingOp(derived(), newDimensions); } + protected: + template friend class TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } +}; + + +template +class TensorBase : public TensorBase { + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Index Index; + typedef Scalar CoeffReturnType; + typedef typename internal::packet_traits::type PacketReturnType; + + template friend class TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const OtherDerived& other) { + return derived() = TensorCwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + } + // Select the device on which to evaluate the expression. 
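// (Illustrative usage sketch — not part of the original patch. It assumes the
//  ThreadPoolDevice declared in TensorDeviceType.h and hypothetical float tensors:
//    Eigen::Tensor<float, 2> a(30, 40), b(30, 40), c(30, 40);
//    a.setRandom(); b.setRandom();
//    Eigen::ThreadPoolDevice pool_device(/*num_cores=*/4);
//    c.device(pool_device) = a + b;   // assignment is routed through TensorDevice::operator=
//  Without the device() call, the expression is evaluated on the DefaultDevice.)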
template TensorDevice device(const DeviceType& device) { return TensorDevice(device, derived()); } - protected: - template friend class TensorBase; + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index d424df36e..d371eb76d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -35,6 +35,10 @@ struct traits > typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index ca2e0e562..501e9a522 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -35,6 +35,10 @@ struct traits > typedef typename KernelXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 60908ee94..de66da13f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -36,6 +36,10 @@ struct traits > typedef typename XprType::Scalar Scalar; typedef typename XprType::Nested XprTypeNested; typedef typename remove_reference::type _XprTypeNested; + + enum { + Flags = 0, + }; }; } // end namespace internal @@ -153,6 +157,10 @@ struct traits > typedef typename RhsXprType::Nested RhsNested; typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index b8833362c..1fb90478f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -15,7 +15,7 @@ namespace Eigen { template class Tensor; template class TensorFixedSize; template class TensorMap; -template class TensorBase; +template::value> class TensorBase; template class TensorCwiseNullaryOp; template class TensorCwiseUnaryOp; @@ -29,6 +29,10 @@ template class TensorDevice; // Move to internal? template struct TensorEvaluator; +namespace internal { +template struct TensorAssign; +} // end namespace internal + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 3e089fe1e..7d5f9271e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -21,7 +21,7 @@ namespace Eigen { */ namespace internal { template -struct traits > +struct traits > : public traits { // Type promotion to handle the case where the types of the lhs and the rhs are different. 
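// (Illustrative use of the reshape() operation these traits describe — not part of
//  the original patch; the construction of the dimension object is an assumption:
//    Eigen::Tensor<float, 2> input(6, 4);
//    input.setRandom();
//    Eigen::DSizes<Eigen::DenseIndex, 3> new_dims;
//    new_dims[0] = 2; new_dims[1] = 3; new_dims[2] = 4;
//    Eigen::Tensor<float, 3> result(2, 3, 4);
//    result = input.reshape(new_dims);   // TensorReshapingOp node: same 24 coefficients, new shape
// )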
typedef typename XprType::Scalar Scalar; @@ -81,6 +81,7 @@ template struct TensorEvaluator > { typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; enum { IsAligned = TensorEvaluator::IsAligned, @@ -95,7 +96,7 @@ struct TensorEvaluator > typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - const NewDimensions& dimensions() const { return m_dimensions; } + const Dimensions& dimensions() const { return m_dimensions; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 2de698a57..40f805741 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -52,7 +52,7 @@ struct traits > typedef DenseIndex Index; enum { Options = Options_, - Flags = compute_tensor_flags::ret, + Flags = compute_tensor_flags::ret | LvalueBit, }; }; @@ -63,6 +63,10 @@ struct traits > typedef Scalar_ Scalar; typedef Dense StorageKind; typedef DenseIndex Index; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret | LvalueBit, + }; }; -- cgit v1.2.3 From a77458a8ff2a83e716add62253eb50ef64980b21 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 9 Jun 2014 10:06:57 -0700 Subject: Fixes compilation errors triggered when compiling the tensor contraction code with cxx11 enabled. --- .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 6 ++++++ .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h | 17 +++++++++++++---- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 16 ++++++++-------- 3 files changed, 27 insertions(+), 12 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index f102872ae..423ca4be4 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -66,6 +66,12 @@ template constexpr inline T const& array_ #undef STD_GET_ARR_HACK +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; + + /* Suppose you have a template of the form * template struct X; * And you want to specialize it in such a way: diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h index 636063f9e..1d3164d6a 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h @@ -182,23 +182,32 @@ array repeat(t v) { } template -t array_prod(const array& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& a) { t prod = 1; for (size_t i = 0; i < n; ++i) { prod *= a[i]; } return prod; } template -t array_prod(const array& /*a*/) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array& /*a*/) { return 0; } -template inline T& array_get(array& a) { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { return a[I]; } -template inline const T& array_get(const array& a) { +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +const T& array_get(const array& a) { return a[I]; } + +template struct array_size; +template struct array_size > { + static const size_t value = N; +}; + + struct sum_op { template static inline bool run(A a, B b) { return a + b; } }; diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index d371eb76d..5149de1bb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -107,7 +107,7 @@ struct TensorEvaluator XprType; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * Indices::size>::size; + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; typedef typename XprType::Index Index; typedef DSizes Dimensions; @@ -128,7 +128,7 @@ struct TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; - for (int j = 0; j < Indices::size; ++j) { + for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].first == i) { skip = true; m_leftOffsets[2*skipped] = stride; @@ -151,7 +151,7 @@ struct TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; - for (int j = 0; j < Indices::size; ++j) { + for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].second == i) { skip = true; m_rightOffsets[2*skipped] = stride; @@ -168,7 +168,7 @@ struct TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * Indices::size) { + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } @@ -209,7 +209,7 @@ struct TensorEvaluator::value-1) { partialStitch(left, right, StitchIndex+1, accum); } else { accum += m_leftImpl.coeff(left) * m_rightImpl.coeff(right); @@ -218,9 +218,9 @@ struct TensorEvaluator m_leftOffsets; - array m_rightOffsets; - array m_stitchsize; + array::value> m_leftOffsets; + array::value> m_rightOffsets; + array::value> m_stitchsize; Index m_shiftright; Dimensions m_dimensions; TensorEvaluator m_leftImpl; -- cgit v1.2.3 From 925fb6b93710b95082ba44d30405289dff3707eb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Jun 2014 09:14:44 -0700 Subject: TensorEval are now typed on the device: this will make it possible to use partial template specialization to optimize the strategy of each evaluator for each device type. Started work on partial evaluations. --- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 42 ++++++------ unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 14 ++-- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 26 ++++---- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 20 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorDeviceType.h | 28 ++++++-- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 76 +++++++++++----------- .../CXX11/src/Tensor/TensorForwardDeclarations.h | 9 +-- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 14 ++-- 9 files changed, 129 insertions(+), 102 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index da1eb62cb..633a7a31b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -32,15 +32,15 @@ namespace Eigen { namespace internal { // Default strategy: the expressions are evaluated with a single cpu thread. 
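// (Sketch of the dispatch pattern this change enables — simplified, hypothetical names,
//  not part of the original patch. Giving every evaluator a Device template parameter
//  allows partial specialization per device type:
//    template <typename Expr, typename Device> struct Assign {     // generic single-thread path
//      static void run(Expr& dst, const Expr& src, const Device& d);
//    };
//    template <typename Expr> struct Assign<Expr, GpuDevice> {     // device-specific override
//      static void run(Expr& dst, const Expr& src, const GpuDevice& d);  // e.g. launch on d.stream()
//    };
//  The same mechanism is applied below to TensorEvaluator<Expr, Device>.)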
-template::PacketAccess & TensorEvaluator::PacketAccess> +template::PacketAccess & TensorEvaluator::PacketAccess> struct TensorAssign { typedef typename Derived1::Index Index; EIGEN_DEVICE_FUNC - static inline void run(Derived1& dst, const Derived2& src) + static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, device); + TensorEvaluator evalSrc(src, device); const Index size = dst.size(); for (Index i = 0; i < size; ++i) { evalDst.coeffRef(i) = evalSrc.coeff(i); @@ -49,19 +49,19 @@ struct TensorAssign }; -template -struct TensorAssign +template +struct TensorAssign { typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src) + static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, device); + TensorEvaluator evalSrc(src, device); const Index size = dst.size(); - static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int PacketSize = unpacket_traits::PacketReturnType>::size; + static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int PacketSize = unpacket_traits::PacketReturnType>::size; const int VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = 0; i < VectorizedSize; i += PacketSize) { @@ -116,12 +116,12 @@ struct TensorAssignMultiThreaded typedef typename Derived1::Index Index; static inline void run(Derived1& dst, const Derived2& src, const ThreadPoolDevice& device) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, DefaultDevice()); + TensorEvaluator evalSrc(src, Defaultevice()); const Index size = dst.size(); - static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; - static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; + static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; + static const int PacketSize = Vectorizable ? 
unpacket_traits::PacketReturnType>::size : 1; int blocksz = static_cast(ceil(static_cast(size)/device.numThreads()) + PacketSize - 1); const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); @@ -131,7 +131,7 @@ struct TensorAssignMultiThreaded vector > results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); + results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); } for (int i = 0; i < numblocks; ++i) { @@ -167,19 +167,19 @@ struct TensorAssignGpu typedef typename Derived1::Index Index; static inline void run(Derived1& dst, const Derived2& src, const GpuDevice& device) { - TensorEvaluator evalDst(dst); - TensorEvaluator evalSrc(src); + TensorEvaluator evalDst(dst, device); + TensorEvaluator evalSrc(src, device); const Index size = dst.size(); const int block_size = std::min(size, 32*32); const int num_blocks = size / block_size; - EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); + EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); const int remaining_items = size % block_size; if (remaining_items > 0) { const int peel_start_offset = num_blocks * block_size; const int peel_block_size = std::min(size, 32); const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; - EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); + EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); } } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index e447a5d40..6b53d2a3d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -198,19 +198,25 @@ class TensorBase } // Coefficient-wise ternary operators. - template - inline const TensorSelectOp + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); } // Morphing operators (slicing tbd). - template - inline const TensorReshapingOp + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp reshape(const NewDimensions& newDimensions) const { return TensorReshapingOp(derived(), newDimensions); } + // Force the evaluation of the expression. 
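// (Illustrative only — not part of the original patch. eval() wraps the expression in a
//  TensorForcedEvalOp node; when the enclosing expression is evaluated, that sub-expression
//  is computed once into a temporary buffer instead of being recomputed per coefficient:
//    Eigen::Tensor<float, 2> a(3, 4), b(3, 4), c(3, 4);
//    a.setRandom(); b.setRandom();
//    c = (a + b).eval();   // hypothetical usage: a + b is materialized, then copied into c
// )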
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp eval() const { + return TensorForcedEvalOp(derived()); + } + protected: template friend class TensorBase; EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 5149de1bb..cadbabda2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -102,31 +102,31 @@ template <> struct max_n_1<0> { }; -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorContractionOp XprType; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; typedef typename XprType::Index Index; typedef DSizes Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ false, }; - TensorEvaluator(const XprType& op) - : m_leftImpl(op.lhsExpression()), m_rightImpl(op.rhsExpression()) + TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { Index index = 0; Index stride = 1; m_shiftright = 1; int skipped = 0; - const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].first == i) { @@ -148,8 +148,8 @@ struct TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { + const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); + for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { bool skip = false; for (int j = 0; j < internal::array_size::value; ++j) { if (op.indices()[j].second == i) { @@ -168,7 +168,7 @@ struct TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } @@ -223,8 +223,8 @@ struct TensorEvaluator::value> m_stitchsize; Index m_shiftright; Dimensions m_dimensions; - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 501e9a522..a554b8260 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -94,27 +94,27 @@ class TensorConvolutionOp : public TensorBase -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorConvolutionOp XprType; - static const int NumDims = TensorEvaluator::Dimensions::count; + static const int NumDims = TensorEvaluator::Dimensions::count; static const int KernelDims = Indices::size; typedef typename XprType::Index Index; 
typedef DSizes Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ false, }; - TensorEvaluator(const XprType& op) - : m_inputImpl(op.inputExpression()), m_kernelImpl(op.kernelExpression()), m_dimensions(op.inputExpression().dimensions()) + TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_dimensions(op.inputExpression().dimensions()) { - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); for (int i = 0; i < NumDims; ++i) { if (i > 0) { @@ -200,8 +200,8 @@ struct TensorEvaluator m_indexStride; array m_kernelStride; Dimensions m_dimensions; - TensorEvaluator m_inputImpl; - TensorEvaluator m_kernelImpl; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index dbe60a165..ce524a818 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -31,7 +31,7 @@ template class TensorDevice { template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssign::run(m_expression, other); + internal::TensorAssign::run(m_expression, other, m_device); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h index d7f5ab7c9..142edda14 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h @@ -15,6 +15,12 @@ namespace Eigen { // Default device for the machine (typically a single cpu core) struct DefaultDevice { + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } }; @@ -22,14 +28,19 @@ struct DefaultDevice { // We should really use a thread pool here but first we need to find a portable thread pool library. #ifdef EIGEN_USE_THREADS struct ThreadPoolDevice { - ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } + ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { } size_t numThreads() const { return num_threads_; } - /*ThreadPool* threadPool() const { return pool_; }*/ + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } private: // todo: NUMA, ... size_t num_threads_; - /*ThreadPool* pool_;*/ }; #endif @@ -40,7 +51,16 @@ struct GpuDevice { // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction. 
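// (Illustrative sketch, not part of the original patch — assumes the CUDA runtime API:
//    cudaStream_t stream;
//    cudaStreamCreate(&stream);               // caller creates the stream
//    Eigen::GpuDevice gpu_device(&stream);    // GpuDevice only stores a pointer to it
//    ... c.device(gpu_device) = a + b; ...
//    cudaStreamDestroy(stream);               // and the caller destroys it afterwards
// )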
GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); } - const cudaStream_t& stream() const { return *stream_; } + EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + void* result; + cudaMalloc(&result, num_bytes); + return result; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + cudaFree(buffer); + } private: // TODO: multigpu. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index ab2513cea..80fe06957 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -23,7 +23,7 @@ namespace Eigen { * leading to lvalues (slicing, reshaping, etc...) */ -template +template struct TensorEvaluator { typedef typename Derived::Index Index; @@ -38,7 +38,7 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m) + EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m, const Device&) : m_data(const_cast(m.data())), m_dims(m.dimensions()) { } @@ -73,8 +73,8 @@ struct TensorEvaluator // -------------------- CwiseNullaryOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorCwiseNullaryOp XprType; @@ -84,14 +84,14 @@ struct TensorEvaluator > }; EIGEN_DEVICE_FUNC - TensorEvaluator(const XprType& op) - : m_functor(op.functor()), m_argImpl(op.nestedExpression()) + TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -108,32 +108,32 @@ struct TensorEvaluator > private: const NullaryOp m_functor; - TensorEvaluator m_argImpl; + TensorEvaluator m_argImpl; }; // -------------------- CwiseUnaryOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorCwiseUnaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), - m_argImpl(op.nestedExpression()) + m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -150,33 +150,33 @@ struct TensorEvaluator > private: const UnaryOp m_functor; - TensorEvaluator m_argImpl; + TensorEvaluator m_argImpl; }; // -------------------- CwiseBinaryOp -------------------- -template -struct TensorEvaluator > +template +struct 
TensorEvaluator, Device> { typedef TensorCwiseBinaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), - m_leftImpl(op.lhsExpression()), - m_rightImpl(op.rhsExpression()) + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -196,34 +196,34 @@ struct TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; // -------------------- SelectOp -------------------- -template -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorSelectOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess/* & + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess/* & TensorEvaluator::PacketAccess*/, }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op) - : m_condImpl(op.ifExpression()), - m_thenImpl(op.thenExpression()), - m_elseImpl(op.elseExpression()) + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -248,9 +248,9 @@ struct TensorEvaluator } private: - TensorEvaluator m_condImpl; - TensorEvaluator m_thenImpl; - TensorEvaluator m_elseImpl; + TensorEvaluator m_condImpl; + TensorEvaluator m_thenImpl; + TensorEvaluator m_elseImpl; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 1fb90478f..27bfe1d73 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -21,16 +21,17 @@ template class TensorCwiseNullaryO template class TensorCwiseUnaryOp; template class TensorCwiseBinaryOp; template class TensorSelectOp; +template class TensorReductionOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; -template class TensorDevice; +template class TensorForcedEvalOp; -// Move to internal? 
-template struct TensorEvaluator; +template class TensorDevice; +template struct TensorEvaluator; namespace internal { -template struct TensorAssign; +template struct TensorAssign; } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 7d5f9271e..e9e74581f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -77,19 +77,19 @@ class TensorReshapingOp : public TensorBase -struct TensorEvaluator > +template +struct TensorEvaluator, Device> { typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, }; - TensorEvaluator(const XprType& op) - : m_impl(op.expression()), m_dimensions(op.dimensions()) + TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { } typedef typename XprType::Index Index; @@ -111,7 +111,7 @@ struct TensorEvaluator > private: NewDimensions m_dimensions; - TensorEvaluator m_impl; + TensorEvaluator m_impl; }; -- cgit v1.2.3 From 38ab7e6ed0491bd5a0c639f218d5ea4728bf1e81 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 13 Jun 2014 09:56:51 -0700 Subject: Reworked the expression evaluation mechanism in order to make it possible to efficiently compute convolutions and contractions in the future: * The scheduling of computation is moved out the the assignment code and into a new TensorExecutor class * The assignment itself is now a regular node on the expression tree * The expression evaluators start by recursively evaluating all their subexpressions if needed --- unsupported/Eigen/CXX11/Tensor | 4 + unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 250 +++++++++------------ .../Eigen/CXX11/src/Tensor/TensorContraction.h | 8 + .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 9 + unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h | 16 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 146 ++++++++++++ .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 56 ++++- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 194 ++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 142 ++++++++++++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 7 + 14 files changed, 685 insertions(+), 164 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index c67020581..7e504b302 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -42,8 +42,12 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" #include 
"unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" + #include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" #include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 7f614bbe8..09601fc7d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -236,7 +236,9 @@ class Tensor : public TensorBase > // FIXME: we need to resize the tensor to fix the dimensions of the other. // Unfortunately this isn't possible yet when the rhs is an expression. // resize(other.dimensions()); - internal::TensorAssign::run(*this, other); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 633a7a31b..a2a925775 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -10,10 +10,6 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H #define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H -#ifdef EIGEN_USE_THREADS -#include -#endif - namespace Eigen { /** \class TensorAssign @@ -21,172 +17,134 @@ namespace Eigen { * * \brief The tensor assignment class. * - * This class is responsible for triggering the evaluation of the expressions - * used on the lhs and rhs of an assignment operator and copy the result of - * the evaluation of the rhs expression at the address computed during the - * evaluation lhs expression. - * - * TODO: vectorization. For now the code only uses scalars - * TODO: parallelisation using multithreading on cpu, or kernels on gpu. + * This class is represents the assignment of the values resulting from the evaluation of + * the rhs expression to the memory locations denoted by the lhs expression. */ namespace internal { - -// Default strategy: the expressions are evaluated with a single cpu thread. 
-template::PacketAccess & TensorEvaluator::PacketAccess> -struct TensorAssign +template +struct traits > { - typedef typename Derived1::Index Index; - EIGEN_DEVICE_FUNC - static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) - { - TensorEvaluator evalDst(dst, device); - TensorEvaluator evalSrc(src, device); - const Index size = dst.size(); - for (Index i = 0; i < size; ++i) { - evalDst.coeffRef(i) = evalSrc.coeff(i); - } - } + typedef typename LhsXprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + + enum { + Flags = 0, + }; }; +template +struct eval, Eigen::Dense> +{ + typedef const TensorAssignOp& type; +}; -template -struct TensorAssign +template +struct nested, 1, typename eval >::type> { - typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src, const Device& device = Device()) - { - TensorEvaluator evalDst(dst, device); - TensorEvaluator evalSrc(src, device); - const Index size = dst.size(); - - static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - static const int PacketSize = unpacket_traits::PacketReturnType>::size; - const int VectorizedSize = (size / PacketSize) * PacketSize; - - for (Index i = 0; i < VectorizedSize; i += PacketSize) { - evalDst.template writePacket(i, evalSrc.template packet(i)); - } - for (Index i = VectorizedSize; i < size; ++i) { - evalDst.coeffRef(i) = evalSrc.coeff(i); - } - } + typedef TensorAssignOp type; }; +} // end namespace internal -// Multicore strategy: the index space is partitioned and each core is assigned to a partition -#ifdef EIGEN_USE_THREADS -template -struct EvalRange { - static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) { - eigen_assert(last > first); - for (Index i = first; i < last; ++i) { - dst.coeffRef(i) = src.coeff(i); - } - } -}; -template -struct EvalRange { - static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) { - eigen_assert(last > first); - - Index i = first; - static const int PacketSize = unpacket_traits::size; - if (last - first > PacketSize) { - static const int LhsStoreMode = LhsEval::IsAligned ? Aligned : Unaligned; - static const int RhsLoadMode = RhsEval::IsAligned ? 
Aligned : Unaligned; - eigen_assert(first % PacketSize == 0); - Index lastPacket = last - (last % PacketSize); - for (; i < lastPacket; i += PacketSize) { - dst.template writePacket(i, src.template packet(i)); - } - } - - for (; i < last; ++i) { - dst.coeffRef(i) = src.coeff(i); - } - } +template +class TensorAssignOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename LhsXprType::CoeffReturnType CoeffReturnType; + typedef typename LhsXprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + typename internal::remove_all::type& + lhsExpression() const { return *((typename internal::remove_all::type*)&m_lhs_xpr); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename internal::remove_all::type& m_lhs_xpr; + const typename internal::remove_all::type& m_rhs_xpr; }; -template -struct TensorAssignMultiThreaded + +template +struct TensorEvaluator, Device> { - typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src, const ThreadPoolDevice& device) + typedef TensorAssignOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { - TensorEvaluator evalDst(dst, DefaultDevice()); - TensorEvaluator evalSrc(src, Defaultevice()); - const Index size = dst.size(); - - static const bool Vectorizable = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess; - static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; - - int blocksz = static_cast(ceil(static_cast(size)/device.numThreads()) + PacketSize - 1); - const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Index i = 0; - vector > results; - results.reserve(numblocks); - for (int i = 0; i < numblocks; ++i) { - results.push_back(std::async(std::launch::async, &EvalRange, TensorEvaluator, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize)); - } - - for (int i = 0; i < numblocks; ++i) { - results[i].get(); - } - - if (numblocks * blocksize < size) { - EvalRange, TensorEvaluator, Index>::run(evalDst, evalSrc, numblocks * blocksize, size); - } + // TODO: use left impl instead if right impl dimensions are known at compile time. + return m_rightImpl.dimensions(); } -}; -#endif - -// GPU: the evaluation of the expressions is offloaded to a GPU. 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -template -__global__ void EigenMetaKernelNoCheck(LhsEvaluator evalDst, const RhsEvaluator evalSrc) { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - evalDst.coeffRef(index) = evalSrc.coeff(index); -} -template -__global__ void EigenMetaKernelPeel(LhsEvaluator evalDst, const RhsEvaluator evalSrc, int peel_start_offset, int size) { - const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x; - if (index < size) { - evalDst.coeffRef(index) = evalSrc.coeff(index); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_leftImpl.evalSubExprsIfNeeded(); + m_rightImpl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); } -} -template -struct TensorAssignGpu -{ - typedef typename Derived1::Index Index; - static inline void run(Derived1& dst, const Derived2& src, const GpuDevice& device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + static const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + static const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { - TensorEvaluator evalDst(dst, device); - TensorEvaluator evalSrc(src, device); - const Index size = dst.size(); - const int block_size = std::min(size, 32*32); - const int num_blocks = size / block_size; - EigenMetaKernelNoCheck, TensorEvaluator > <<>>(evalDst, evalSrc); - - const int remaining_items = size % block_size; - if (remaining_items > 0) { - const int peel_start_offset = num_blocks * block_size; - const int peel_block_size = std::min(size, 32); - const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; - EigenMetaKernelPeel, TensorEvaluator > <<>>(evalDst, evalSrc, peel_start_offset, size); - } + return m_leftImpl.coeff(index); } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_leftImpl.template packet(index); + } + + private: + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; }; -#endif -} // end namespace internal +} -} // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index cadbabda2..b2e12fd15 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -184,6 +184,14 @@ struct TensorEvaluator class TensorDevice { template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssign::run(m_expression, other, m_device); + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -48,7 +51,10 @@ template class TensorDevice EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssignMultiThreaded::run(m_expression, other, m_device); + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + static const bool Vectorize = TensorEvaluator::PacketAccess; + internal::TensorExecutor::run(assign, m_device); return *this; } @@ -67,13 
+73,15 @@ template class TensorDevice template EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - internal::TensorAssignGpu::run(m_expression, other, m_device); + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + internal::TensorExecutor::run(assign, m_device); return *this; } protected: const GpuDevice& m_device; - ExpressionType& m_expression; + ExpressionType m_expression; }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 000000000..db716a80e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,146 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + + enum { + Flags = 0, + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorEvalToOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorEvalToOp type; +}; + +} // end namespace internal + + + + +template +class TensorEvalToOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(Scalar* buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC Scalar* buffer() const { return m_buffer; } + + protected: + typename XprType::Nested m_xpr; + Scalar* m_buffer; +}; + + + +template +struct TensorEvaluator, Device> +{ + typedef TensorEvalToOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::Packet Packet; + typedef typename TensorEvaluator::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = true, + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_buffer(op.buffer()) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType 
PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_buffer[i] = m_impl.coeff(i); + } + EIGEN_STRONG_INLINE void evalPacket(Index i) { + internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + private: + TensorEvaluator m_impl; + const Device& m_device; + Scalar* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 80fe06957..5c8b079da 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -38,27 +38,32 @@ struct TensorEvaluator PacketAccess = Derived::PacketAccess, }; - EIGEN_DEVICE_FUNC TensorEvaluator(Derived& m, const Device&) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(Derived& m, const Device&) : m_data(const_cast(m.data())), m_dims(m.dimensions()) { } - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data); return m_data[index]; } - EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_assert(m_data); return m_data[index]; } - template + template EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return internal::ploadt(m_data + index); } - template + template EIGEN_STRONG_INLINE void writePacket(Index index, const Packet& x) { return internal::pstoret(m_data + index, x); @@ -95,13 +100,16 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(index); } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(index); } @@ -137,13 +145,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_argImpl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_argImpl.cleanup(); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_STRONG_INLINE 
PacketReturnType packet(Index index) const { return m_functor.packetOp(m_argImpl.template packet(index)); } @@ -184,12 +199,21 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); } @@ -230,12 +254,24 @@ struct TensorEvaluator // TODO: use then or else impl instead if they happen to be known at compile time. return m_condImpl.dimensions(); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_condImpl.evalSubExprsIfNeeded(); + m_thenImpl.evalSubExprsIfNeeded(); + m_elseImpl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_condImpl.cleanup(); + m_thenImpl.cleanup(); + m_elseImpl.cleanup(); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); } template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + PacketReturnType packet(Index index) const { static const int PacketSize = internal::unpacket_traits::size; internal::Selector select; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 000000000..3e41f3290 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,194 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H + +#ifdef EIGEN_USE_THREADS +#include +#endif + +namespace Eigen { + +/** \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + */ +namespace internal { + +// Default strategy: the expression is evaluated with a single cpu thread. 
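The single-threaded strategy named in the comment above boils down to a three-phase protocol that every evaluator in this patch series implements: evalSubExprsIfNeeded(), a flat loop of evalScalar()/evalPacket() calls over the linearized index space, then cleanup(). A hypothetical sketch of that protocol (the function name is mine; in the real executor the trip count comes from evaluator.dimensions().TotalSize()):

// Illustration only, not part of the patch.
template <typename Evaluator, typename Index>
void run_single_threaded(Evaluator& evaluator, Index total_size) {
  evaluator.evalSubExprsIfNeeded();   // let subexpressions materialize temporary buffers
  for (Index i = 0; i < total_size; ++i) {
    evaluator.evalScalar(i);          // for an assignment: dst.coeffRef(i) = src.coeff(i)
  }
  evaluator.cleanup();                // release any temporaries allocated above
}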
+template::PacketAccess> +struct TensorExecutor +{ + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const Device& device = Device()) + { + TensorEvaluator evaluator(expr, device); + evaluator.evalSubExprsIfNeeded(); + + const Index size = evaluator.dimensions().TotalSize(); + for (Index i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } + + evaluator.cleanup(); + } +}; + + +template +struct TensorExecutor +{ + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) + { + TensorEvaluator evaluator(expr, device); + evaluator.evalSubExprsIfNeeded(); + + const Index size = evaluator.dimensions().TotalSize(); + static const int PacketSize = unpacket_traits::PacketReturnType>::size; + const int VectorizedSize = (size / PacketSize) * PacketSize; + + for (Index i = 0; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (Index i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + + evaluator.cleanup(); + } +}; + + + +// Multicore strategy: the index space is partitioned and each partition is executed on a single core +#ifdef EIGEN_USE_THREADS +template +struct EvalRange { + static void run(Evaluator& evaluator, const Index first, const Index last) { + eigen_assert(last > first); + for (Index i = first; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template +struct EvalRange { + static void run(Evaluator& evaluator, const Index first, const Index last,) { + eigen_assert(last > first); + + Index i = first; + static const int PacketSize = unpacket_traits::size; + if (last - first > PacketSize) { + eigen_assert(first % PacketSize == 0); + Index lastPacket = last - (last % PacketSize); + for (; i < lastPacket; i += PacketSize) { + evaluator.evalPacket(i); + } + } + + for (; i < last; ++i) { + evaluator.evalScalar(i); + } + } +}; + +template +struct TensorExecutor +{ + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const ThreadPoolDevice& device) + { + TensorEvaluator evaluator(expr, device); + evaluator.evalSubExprsIfNeeded(); + + const Index size = evaluator.dimensions().TotalSize(); + + static const int PacketSize = Vectorizable ? unpacket_traits::PacketReturnType>::size : 1; + + int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; + const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + TensorEvaluator single_threaded_eval(expr, DefaultDevice()); + + Index i = 0; + vector > results; + results.reserve(numblocks); + for (int i = 0; i < numblocks; ++i) { + results.push_back(std::async(std::launch::async, &EvalRange, Index>::run, single_threaded_eval, i*blocksize, (i+1)*blocksize)); + } + + for (int i = 0; i < numblocks; ++i) { + results[i].get(); + } + + if (numblocks * blocksize < size) { + EvalRange, Index>::run(single_threaded_eval, numblocks * blocksize, size, nullptr); + } + + evaluator.cleanup(); + } +}; +#endif + + +// GPU: the evaluation of the expression is offloaded to a GPU. 
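The GPU specialization that follows launches one kernel without bounds checks over as many full blocks as fit, then a second "peel" kernel for the tail. A hypothetical host-side helper (the name and signature are mine, not Eigen's) mirroring that launch arithmetic:

#include <algorithm>

// Illustration only, not part of the patch.
inline void cuda_launch_shape(int size, int& num_blocks, int& block_size,
                              int& peel_num_blocks, int& peel_block_size) {
  block_size = std::min(size, 32 * 32);          // up to 1024 threads per block
  num_blocks = size / block_size;                // full blocks, no bounds check needed
  const int remaining_items = size % block_size; // tail handled by the peel kernel
  peel_block_size = std::min(size, 32);
  peel_num_blocks = remaining_items > 0
      ? (remaining_items + peel_block_size - 1) / peel_block_size
      : 0;
}
// e.g. size = 2500: block_size = 1024, num_blocks = 2 (covers 2048 coefficients),
// then the 452 remaining coefficients run in 15 peel blocks of 32 threads, each
// thread guarded by the "if (index < size)" test in EigenMetaKernelPeel.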
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template +__global__ void EigenMetaKernelNoCheck(Evaluator eval) { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + eval.evalScalar(index); +} +template +__global__ void EigenMetaKernelPeel(Evaluator eval, int peel_start_offset, int size) { + const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x; + if (index < size) { + eval.evalScalar(index); + } +} + +template +struct TensorExecutor +{ + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const GpuDevice& device) + { + TensorEvaluator evaluator(expr, device); + evaluator.evalSubExprsIfNeeded(); + + const Index size = evaluator.dimensions().TotalSize(); + const int block_size = std::min(size, 32*32); + const int num_blocks = size / block_size; + EigenMetaKernelNoCheck > <<>>(evaluator); + + const int remaining_items = size % block_size; + if (remaining_items > 0) { + const int peel_start_offset = num_blocks * block_size; + const int peel_block_size = std::min(size, 32); + const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size; + EigenMetaKernelPeel > <<>>(evaluator, peel_start_offset, size); + } + evaluator.cleanup(); + } +}; +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 789c04238..d42167da9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -200,7 +200,9 @@ class TensorFixedSize : public TensorBase::run(*this, other); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 000000000..6f6641de6 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,142 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
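// [Annotation, not in the original patch] The "type promotion" comment above is
// inherited from the binary-op traits it was copied from; TensorForcedEvalOp has a
// single operand, so Scalar, StorageKind and Index below are taken directly from XprType.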
+ typedef typename XprType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + + enum { + Flags = 0, + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorForcedEvalOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorForcedEvalOp type; +}; + +} // end namespace internal + + + +template +class TensorForcedEvalOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::internal::traits::Packet Packet; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef TensorForcedEvalOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::Packet Packet; + typedef typename TensorEvaluator::Dimensions Dimensions; + + enum { + IsAligned = true, + PacketAccess = true, + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) + { } + + EIGEN_DEVICE_FUNC ~TensorEvaluator() { + eigen_assert(!m_buffer); + } + + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + m_buffer = (Scalar*)m_device.allocate(m_impl.dimensions().TotalSize() * sizeof(Scalar)); + + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(m_buffer, m_op); + internal::TensorExecutor::PacketAccess>::run(evalToTmp, m_device); + m_impl.cleanup(); + } + EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate(m_buffer); + m_buffer = NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + private: + TensorEvaluator m_impl; + const ArgType m_op; + const Device& m_device; + Scalar* m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 27bfe1d73..c0dffbd0c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -25,13 +25,16 @@ template class TensorReductionOp; template class TensorContractionOp; template class TensorConvolutionOp; template class TensorReshapingOp; +template class TensorAssignOp; + +template class 
TensorEvalToOp; template class TensorForcedEvalOp; template class TensorDevice; template struct TensorEvaluator; namespace internal { -template struct TensorAssign; +template class TensorExecutor; } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 3a06170fa..c97135b63 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -246,7 +246,9 @@ template class TensorMap : public Tensor EIGEN_DEVICE_FUNC Self& operator=(const OtherDerived& other) { - internal::TensorAssign::run(*this, other); + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); return *this; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index e9e74581f..764bba4e6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -98,6 +98,13 @@ struct TensorEvaluator, Device> const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() { + m_impl.evalSubExprsIfNeeded(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_impl.coeff(index); -- cgit v1.2.3 From f8fad09301106c574ed88ffde52e15483d14673f Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:33:18 -0700 Subject: Updated the convolution and contraction evaluators to follow the new EvalSubExprsIfNeeded apu. --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 7 ++++--- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index b2e12fd15..8d7a1351e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -184,9 +184,10 @@ struct TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); @@ -151,11 +151,12 @@ struct TensorEvaluator Date: Wed, 13 Aug 2014 08:36:33 -0700 Subject: Added missing apis. 
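For reference, the assignment-path refactor these patches build on replaces the old internal::TensorAssign entry points with an explicit TensorAssignOp handed to the executor. A minimal reconstruction of that path (the template arguments are assumptions on my part, since the hunks above lost them):

#include <unsupported/Eigen/CXX11/Tensor>

// Sketch only; mirrors what TensorMap::operator= and TensorFixedSize::operator=
// now do with the DefaultDevice.
template <typename Dst, typename Src>
void assign_via_executor(Dst& dst, const Src& src) {
  typedef Eigen::TensorAssignOp<Dst, const Src> Assign;
  Assign assign(dst, src);
  Eigen::internal::TensorExecutor<const Assign, Eigen::DefaultDevice>::run(
      assign, Eigen::DefaultDevice());
}

TensorDevice::operator= does the same with its wrapped device, and the Vectorize flag passed to the executor is taken from TensorEvaluator<...>::PacketAccess.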
--- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 +++- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 8d7a1351e..b2969337f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -173,7 +173,7 @@ struct TensorEvaluator::value> m_leftOffsets; array::value> m_rightOffsets; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 8864c5329..e3068dcae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -148,6 +148,7 @@ struct TensorEvaluator m_inputStride; array m_outputStride; -- cgit v1.2.3 From f1d8c13dbcbe38938dcd727f9b50339a981197c3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 13 Aug 2014 08:40:26 -0700 Subject: Fixed misc typos. --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index b2969337f..897d73806 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -184,7 +184,7 @@ struct TensorEvaluator Date: Sat, 6 Sep 2014 13:28:24 -0700 Subject: Fixed a typo in the contraction code --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 897d73806..46624724c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -168,7 +168,7 @@ struct TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { + if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } -- cgit v1.2.3 From 12693928228922ecf8fa3fcf14341d195e376a11 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 3 Oct 2014 10:16:59 -0700 Subject: Created the IndexPair type to store pair of tensor indices. CUDA doesn't support std::pair so we can't use them when targeting GPUs. 
Improved the performance on tensor contractions --- .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 729 ++++++++++++++++++--- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 7 + 3 files changed, 656 insertions(+), 84 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h index 3812ecd1f..227522ecb 100644 --- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h +++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h @@ -69,11 +69,13 @@ template constexpr inline T const& array_ #undef STD_GET_ARR_HACK template struct array_size; +template struct array_size > { + static const size_t value = N; +}; template struct array_size > { static const size_t value = N; }; - /* Suppose you have a template of the form * template struct X; * And you want to specialize it in such a way: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 46624724c..1e6f276e0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -20,6 +20,319 @@ namespace Eigen { * */ namespace internal { + +enum { + Rhs = 0, + Lhs = 1, +}; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. + */ +template +class BaseTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(int i) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + Index nocontract_val = left ? row : col; + Index linidx = 0; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? 
col : row; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; + Index linidx[2] = {0, 0}; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? col : row + distance}; + for (int i = array_size::value - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + EIGEN_STATIC_ASSERT(array_size::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + return IndexPair(linidx[0], linidx[1]); + } + + protected: + const Tensor m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + + + +template +class TensorContractionInputMapper; + +template +class TensorContractionSubMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef TensorContractionInputMapper ParentMapper; + typedef TensorContractionSubMapper Self; + typedef Self LinearMapper; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return 
m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + private: + const ParentMapper& m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template::size : 1), + bool inner_dim_contiguous = false, bool inner_dim_reordered = (side != Lhs), int Alignment=Unaligned> +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + + TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index last = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. 
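// [Annotation, not in the original patch] For the Lhs mapper the row index walks the
// first non-contracting dimension, whose stride is 1 when inner_dim_contiguous holds,
// so rows i..i+packet_size-1 are adjacent in memory and a single vectorized load is
// safe. For the Rhs mapper the row index walks the contracting dimensions; if the
// contraction pairs were specified out of order (inner_dim_reordered), adjacent rows
// may map to non-adjacent coefficients, hence the extra contiguity check below and
// the element-wise gather into data[] as a fallback.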
+ if (Tensor::PacketAccess && + (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && + (last - first) == (packet_size - 1)) { + + return this->m_tensor.template packet(first); + } + + EIGEN_ALIGN_DEFAULT Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(last); + + return pload(data); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_DEFAULT Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload(data); + } +}; + + +template +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + + TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + EIGEN_ALIGN_DEFAULT Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload::type>(data); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); + } +}; + + template struct traits > { @@ -53,6 +366,14 @@ struct nested, 1, typena typedef TensorContractionOp type; }; +template +struct traits, Device_> > { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef Device_ Device; +}; + } // end namespace internal @@ -102,143 +423,385 @@ template <> struct max_n_1<0> { }; -template -struct TensorEvaluator, Device> +template +struct TensorContractionEvaluatorBase { + typedef typename internal::traits::Indices Indices; + typedef typename internal::traits::LeftArgType LeftArgType; + typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::Device Device; + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef array::Dimensions::count> left_dim_mapper_t; + typedef array::Dimensions::count> right_dim_mapper_t; + + typedef array::value> contract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; static const int NumDims = max_n_1::Dimensions::count + 
TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; - typedef typename XprType::Index Index; + typedef DSizes Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = /*TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess */ - false, + IsAligned = true, + PacketAccess = (internal::packet_traits::size > 1), }; - TensorEvaluator(const XprType& op, const Device& device) - : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_device(device), m_result(NULL) { - Index index = 0; - Index stride = 1; - m_shiftright = 1; + eigen_assert((internal::array_size::value > 0) && "Must contract on some indices"); + + array::Dimensions::count> lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * m_leftImpl.dimensions()[i]; + } + + array::Dimensions::count> rhs_strides; + rhs_strides[0] = 1; + for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * m_rightImpl.dimensions()[i]; + } - int skipped = 0; + m_i_strides[0] = 1; + m_j_strides[0] = 1; + m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. Additionally, we also + // compute the strides corresponding to the left non-contracting + // dimensions and right non-contracting dimensions. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + int nocontract_idx = 0; const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { - bool skip = false; - for (int j = 0; j < internal::array_size::value; ++j) { + for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < internal::array_size::value; j++) { if (op.indices()[j].first == i) { - skip = true; - m_leftOffsets[2*skipped] = stride; - m_leftOffsets[2*skipped+1] = stride * left_dims[i]; - m_stitchsize[skipped] = left_dims[i]; + contracting = true; break; } } - if (!skip) { - m_dimensions[index++] = left_dims[i]; - m_shiftright *= left_dims[i]; - } else { - ++skipped; + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx+1 < internal::array_size::value) { + m_i_strides[nocontract_idx+1] = m_i_strides[nocontract_idx] * left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * left_dims[i]; + } + dim_idx++; + nocontract_idx++; } - stride *= left_dims[i]; } - stride = 1; - skipped = 0; + nocontract_idx = 0; const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; ++i) { - bool skip = false; - for (int j = 0; j < internal::array_size::value; ++j) { + for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < internal::array_size::value; 
j++) { if (op.indices()[j].second == i) { - skip = true; - m_rightOffsets[2*skipped] = stride; - m_rightOffsets[2*skipped+1] = stride * right_dims[i]; + contracting = true; break; } } - if (!skip) { - m_dimensions[index++] = right_dims[i]; + if (!contracting) { + m_dimensions[dim_idx] = right_dims[i]; + if (nocontract_idx+1 < internal::array_size::value) { + m_j_strides[nocontract_idx+1] = m_j_strides[nocontract_idx] * right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // Now compute the strides corresponding to the contracting dimensions. We + // assumed above that non-contracting axes are represented in the same order + // in the matrix as they are in the tensor. This is not the case for + // contracting axes. As the contracting axes must be of the same size in + // each tensor, we'll only look at the first tensor here. + m_rhs_inner_dim_contiguous = true; + m_rhs_inner_dim_reordered = false; + for (int i = 0; i < internal::array_size::value; i++) { + Index left = op.indices()[i].first; + Index right = op.indices()[i].second; + + Index size = left_dims[left]; + eigen_assert(size == right_dims[right] && "Contraction axes must be same size"); + + if (i+1 < internal::array_size::value) { + m_k_strides[i+1] = m_k_strides[i] * size; } else { - ++skipped; + m_k_size = m_k_strides[i] * size; + } + m_left_contracting_strides[i] = lhs_strides[left]; + m_right_contracting_strides[i] = rhs_strides[right]; + + if (i > 0 && right < op.indices()[i-1].second) { + m_rhs_inner_dim_reordered = true; + } + if (right != i) { + m_rhs_inner_dim_contiguous = false; } - stride *= right_dims[i]; } - // Scalar case + // Scalar case. We represent the result as a 1d tensor of size 1. 
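// [Annotation, not in the original patch] The branch below covers full contractions,
// e.g. a dot product of two rank-1 tensors: 1 + 1 == 2 * 1 contraction pair, every
// dimension is contracted away, and the single output coefficient is exposed as a
// tensor of dimensions {1} rather than a rank-0 object.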
if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { m_dimensions[0] = 1; } } - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename XprType::PacketReturnType PacketReturnType; - - const Dimensions& dimensions() const { return m_dimensions; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - void evalTo(typename XprType::Scalar* buffer) const { - for (int i = 0; i < dimensions().TotalSize(); ++i) { - buffer[i] += coeff(i); - } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); - return true; + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalTyped(buffer); + } + else { + static_cast(this)->template evalTyped(buffer); + } + } + } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - const Index startLeft = index % m_shiftright; - const Index startRight = index / m_shiftright; - CoeffReturnType result = CoeffReturnType(0); - partialStitch(startLeft, startRight, 0, result); - return result; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; } - /* TODO: vectorization template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const - { - assert(false); - }*/ - - private: - EIGEN_DEVICE_FUNC void partialStitch(Index startLeft, Index startRight, int StitchIndex, CoeffReturnType& accum) const { - Index firstLeft = (startLeft / m_leftOffsets[2*StitchIndex]) * m_leftOffsets[2*StitchIndex+1] + (startLeft % m_leftOffsets[2*StitchIndex]); - Index firstRight = (startRight / m_rightOffsets[2*StitchIndex]) * m_rightOffsets[2*StitchIndex+1] + (startRight % m_rightOffsets[2*StitchIndex]); - - for (int j = 0; j < m_stitchsize[StitchIndex]; ++j) { - const Index left = firstLeft+j*m_leftOffsets[2*StitchIndex]; - const Index right = firstRight+j*m_rightOffsets[2*StitchIndex]; - if (StitchIndex < internal::array_size::value-1) { - partialStitch(left, right, StitchIndex+1, accum); - } else { - accum += m_leftImpl.coeff(left) * m_rightImpl.coeff(right); - } - } + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); } - 
Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + + protected: + // Prevent assignment + TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); - private: - array::value> m_leftOffsets; - array::value> m_rightOffsets; - array::value> m_stitchsize; - Index m_shiftright; Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + const Device& m_device; + Scalar* m_result; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; }; +template +struct TensorEvaluator, Device> : + public TensorContractionEvaluatorBase, Device> > { + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Packet Packet; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + typedef array::Dimensions::count> left_dim_mapper_t; + typedef array::Dimensions::count> right_dim_mapper_t; + + typedef array::value> contract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; + typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; + + static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + + typedef DSizes Dimensions; + + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) { } + + template + EIGEN_DEVICE_FUNC void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + const int lhs_packet_size = internal::packet_traits::size; + const int rhs_packet_size = internal::packet_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs pack_lhs; + internal::gemm_pack_rhs pack_rhs; + internal::gebp_kernel gebp; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + 
this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + typedef typename internal::gemm_blocking_space BlockingType; + + // Sizes of the blocks to load in cache. See the Goto paper for details. + BlockingType blocking(m, n, k, true); + const Index kc = blocking.kc(); + const Index mc = (std::min)(m, blocking.mc()); + const Index nc = (std::min)(n, blocking.nc()); + int sizeA = mc * kc; + int sizeB = kc * nc; + + LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); + + for(Index i2=0; i2m_device.deallocate(blockA); + this->m_device.deallocate(blockB); + } +}; + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 5a113dc19..11590b474 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -29,6 +29,13 @@ namespace Eigen { * \sa Tensor */ +// Can't use std::pairs on cuda devices +template struct IndexPair { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } + Index first; + Index second; +}; // Boiler plate code -- cgit v1.2.3 From bfdd9f3ac95d9a2b41e6f2ec1f7434331125b9e1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 15 Oct 2014 15:32:59 -0700 Subject: Made the blocking computation aware of the l3 cache Also optimized the blocking parameters to take into account the number of threads used for a computation --- Eigen/src/Core/SolveTriangular.h | 2 +- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 124 ++++++++++++++------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 16 +-- .../Core/products/GeneralMatrixMatrixTriangular.h | 2 +- Eigen/src/Core/products/Parallelizer.h | 4 +- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 6 +- Eigen/src/Core/products/TriangularMatrixMatrix.h | 2 +- Eigen/src/Core/products/TriangularSolverMatrix.h | 4 +- blas/level3_impl.h | 12 +- test/product_large.cpp | 7 +- unsupported/Eigen/CXX11/Tensor | 2 +- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 13 +-- 13 files changed, 117 insertions(+), 79 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index ef17f288e..e158e3162 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -96,7 +96,7 @@ struct triangular_solver_selector typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? 
RowMajor : ColMajor,Scalar,Scalar, Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType; - BlockingType blocking(rhs.rows(), rhs.cols(), size); + BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); triangular_solve_matrix diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 090c8f4e6..b91786037 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff } /** \internal */ -inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) +inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { - static std::ptrdiff_t m_l1CacheSize = 0; - static std::ptrdiff_t m_l2CacheSize = 0; - if(m_l2CacheSize==0) + static bool m_cache_sizes_initialized = false; + static std::ptrdiff_t m_l1CacheSize = 32*1024; + static std::ptrdiff_t m_l2CacheSize = 256*1024; + static std::ptrdiff_t m_l3CacheSize = 2*1024*1024; + + if(!m_cache_sizes_initialized) { - m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024); - m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024); + int l1CacheSize, l2CacheSize, l3CacheSize; + queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); + m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024); + m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024); + m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024); + m_cache_sizes_initialized = true; } - + if(action==SetAction) { // set the cpu cache size and cache all block sizes from a global cache size in byte eigen_internal_assert(l1!=0 && l2!=0); m_l1CacheSize = *l1; m_l2CacheSize = *l2; + m_l3CacheSize = *l3; } else if(action==GetAction) { eigen_internal_assert(l1!=0 && l2!=0); *l1 = m_l1CacheSize; *l2 = m_l2CacheSize; + *l3 = m_l3CacheSize; } else { @@ -70,10 +79,11 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi * - the number of scalars that fit into a packet (when vectorization is enabled). * * \sa setCpuCacheSizes */ +#define CEIL(a, b) ((a)+(b)-1)/(b) + template -void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - EIGEN_UNUSED_VARIABLE(n); // Explanations: // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed @@ -81,43 +91,71 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) // at the register level. For vectorization purpose, these small vertical panels are unpacked, // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to // stay in L1 cache. 
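// [Annotation, not in the original patch] The rewrite below derives the block sizes
// from all three cache levels instead of L1/L2 only:
//   kc : clamped so a kc x nr slice of B' plus an mr x kc slice of A' fit in L1;
//   nc : the per-thread share of n, further clamped so the kc x nc panel of B' fits
//        in what is left of L2;
//   mc : the per-thread share of m, clamped so each thread's mc x kc block of A' fits
//        in its slice of L3 (L3 is shared, so it is divided by num_threads).
// With a single thread the previous fixed caps are kept (k <= 360 or 240 depending on
// scalar size, m and n <= 3840 / sizeof(RhsScalar)).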
- std::ptrdiff_t l1, l2; - - typedef gebp_traits Traits; - enum { - kdiv = KcFactor * 2 * Traits::nr - * Traits::RhsProgress * sizeof(RhsScalar), - mr = gebp_traits::mr, - mr_mask = (0xffffffff/mr)*mr - }; + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); + + if (num_threads > 1) { + typedef gebp_traits Traits; + typedef typename Traits::ResScalar ResScalar; + enum { + kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + ksub = Traits::mr * Traits::nr * sizeof(ResScalar), + k_mask = (0xffffffff/8)*8, + + mr = Traits::mr, + mr_mask = (0xffffffff/mr)*mr, + + nr = Traits::nr, + nr_mask = (0xffffffff/nr)*nr + }; + SizeType k_cache = (l1-ksub)/kdiv; + if (k_cache < k) { + k = k_cache & k_mask; + eigen_assert(k > 0); + } - manage_caching_sizes(GetAction, &l1, &l2); + SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); + SizeType n_per_thread = CEIL(n, num_threads); + if (n_cache <= n_per_thread) { + // Don't exceed the capacity of the l2 cache. + eigen_assert(n_cache >= static_cast(nr)); + n = n_cache & nr_mask; + eigen_assert(n > 0); + } else { + n = (std::min)(n, (n_per_thread + nr - 1) & nr_mask); + } -// k = std::min(k, l1/kdiv); -// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; -// if(_m l2) { + // l3 is shared between all cores, so we'll give each thread its own chunk of l3. + SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); + SizeType m_per_thread = CEIL(m, num_threads); + if(m_cache < m_per_thread && m_cache >= static_cast(mr)) { + m = m_cache & mr_mask; + eigen_assert(m > 0); + } else { + m = (std::min)(m, (m_per_thread + mr - 1) & mr_mask); + } + } + } + else { + // In unit tests we do not want to use extra large matrices, + // so we reduce the block size to check the blocking strategy is not flawed #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS -// k = std::min(k,240); -// n = std::min(n,3840/sizeof(RhsScalar)); -// m = std::min(m,3840/sizeof(RhsScalar)); - - k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); - n = std::min(n,3840/sizeof(RhsScalar)); - m = std::min(m,3840/sizeof(RhsScalar)); + k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); + n = std::min(n,3840/sizeof(RhsScalar)); + m = std::min(m,3840/sizeof(RhsScalar)); #else - k = std::min(k,24); - n = std::min(n,384/sizeof(RhsScalar)); - m = std::min(m,384/sizeof(RhsScalar)); + k = std::min(k,24); + n = std::min(n,384/sizeof(RhsScalar)); + m = std::min(m,384/sizeof(RhsScalar)); #endif + } } template -inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads) { - computeProductBlockingSizes(k, m, n); + computeProductBlockingSizes(k, m, n, num_threads); } #ifdef EIGEN_HAS_FUSE_CJMADD @@ -1846,8 +1884,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhsm_mc = ActualRows; this->m_nc = ActualCols; @@ -331,21 +331,21 @@ class gemm_blocking_spacem_mc = Transpose ? cols : rows; this->m_nc = Transpose ? 
rows : cols; this->m_kc = depth; - if(full_rows) + if(l3_blocking) { - DenseIndex m = this->m_mc; - computeProductBlockingSizes(this->m_kc, m, this->m_nc); + computeProductBlockingSizes(this->m_kc, this->m_mc, this->m_nc, num_threads); } - else // full columns + else // no l3 blocking { + DenseIndex m = this->m_mc; DenseIndex n = this->m_nc; - computeProductBlockingSizes(this->m_kc, this->m_mc, n); + computeProductBlockingSizes(this->m_kc, m, n, num_threads); } m_sizeA = this->m_mc * this->m_kc; @@ -451,7 +451,7 @@ class GeneralProduct (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true); + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index daa8a1d8a..8de39f76f 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -72,7 +72,7 @@ struct general_matrix_matrix_triangular_product(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); // !!! mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 4079063eb..837e69415 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -49,8 +49,8 @@ inline void initParallel() { int nbt; internal::manage_multi_threading(GetAction, &nbt); - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); } /** \returns the max number of threads reserved for Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index d9e6084c3..21f8175d2 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -343,7 +343,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); // kc must smaller than mc kc = (std::min)(kc,mc); @@ -432,10 +432,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); + computeProductBlockingSizes(kc, mc, nc, 1); std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 77aa3e5ee..4cbb79da0 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -412,7 +412,7 @@ struct TriangularProduct Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); - BlockingType blocking(stripedRows, stripedCols, stripedDepth); + BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false); internal::product_triangular_matrix_matrix0 ? 
l2/(4 * sizeof(Scalar) * otherStride) : 0; subcols = std::max((subcols/Traits::nr)*Traits::nr, Traits::nr); diff --git a/blas/level3_impl.h b/blas/level3_impl.h index a05872666..37a803ced 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -56,7 +56,7 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal else matrix(c, *m, *n, *ldc) *= beta; } - internal::gemm_blocking_space blocking(*m,*n,*k,true); + internal::gemm_blocking_space blocking(*m,*n,*k,1,true); int code = OP(*opa) | (OP(*opb) << 2); func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha, blocking, 0); @@ -131,12 +131,12 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space blocking(*m,*n,*m); + internal::gemm_blocking_space blocking(*m,*n,*m,1,false); func[code](*m, *n, a, *lda, b, *ldb, blocking); } else { - internal::gemm_blocking_space blocking(*m,*n,*n); + internal::gemm_blocking_space blocking(*m,*n,*n,1,false); func[code](*n, *m, a, *lda, b, *ldb, blocking); } @@ -222,12 +222,12 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, if(SIDE(*side)==LEFT) { - internal::gemm_blocking_space blocking(*m,*n,*m); + internal::gemm_blocking_space blocking(*m,*n,*m,1,false); func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking); } else { - internal::gemm_blocking_space blocking(*m,*n,*n); + internal::gemm_blocking_space blocking(*m,*n,*n,1,false); func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking); } return 1; @@ -577,7 +577,7 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal else if(*n<0) info = 3; else if(*k<0) info = 4; else if(*lda(10000,20000); - std::ptrdiff_t l2 = internal::random(1000000,2000000); - setCpuCacheSizes(l1,l2); + std::ptrdiff_t l2 = internal::random(100000,200000); + std::ptrdiff_t l3 = internal::random(1000000,2000000); + setCpuCacheSizes(l1,l2,l3); VERIFY(l1==l1CacheSize()); VERIFY(l2==l2CacheSize()); std::ptrdiff_t k1 = internal::random(10,100)*16; std::ptrdiff_t m1 = internal::random(10,100)*16; std::ptrdiff_t n1 = internal::random(10,100)*16; // only makes sure it compiles fine - internal::computeProductBlockingSizes(k1,m1,n1); + internal::computeProductBlockingSizes(k1,m1,n1,1); } { diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 7ec60044e..47447f446 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -55,7 +55,7 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" -//#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 1e6f276e0..cd992daab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -766,7 +766,7 @@ struct TensorEvaluator BlockingType; // Sizes of the blocks to load in cache. See the Goto paper for details. 
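
[Illustrative aside, not part of the patch] The kc/mc/nc values produced by computeProductBlockingSizes feed a Goto-style blocked GEMM: in the code above, k (kc) is clamped so the packed mr-by-kc and kc-by-nr strips plus the mr-by-nr accumulator fit in L1, n (nc) is sized against the L2 budget, and m (mc) against each thread's share of L3. A rough, self-contained sketch of the blocking idea these sizes drive (hypothetical helper name; simplified, single-threaded, and with plain scalar loops where the real code packs the blocks and runs the register-tiled gebp micro-kernel):

    #include <cstddef>
    #include <algorithm>

    // Naive cache-blocked C += A * B on column-major float buffers.
    // A is m x k, B is k x n, C is m x n; mc/nc/kc are the block sizes.
    void blocked_gemm(const float* A, const float* B, float* C,
                      std::size_t m, std::size_t n, std::size_t k,
                      std::size_t mc, std::size_t nc, std::size_t kc)
    {
      for (std::size_t j2 = 0; j2 < n; j2 += nc) {        // nc-wide panel of B and C
        for (std::size_t k2 = 0; k2 < k; k2 += kc) {      // kc-deep slice, sized for L1-resident strips
          for (std::size_t i2 = 0; i2 < m; i2 += mc) {    // mc-tall block of A, sized for L2/L3
            const std::size_t jend = std::min(j2 + nc, n);
            const std::size_t kend = std::min(k2 + kc, k);
            const std::size_t iend = std::min(i2 + mc, m);
            for (std::size_t j = j2; j < jend; ++j)
              for (std::size_t p = k2; p < kend; ++p)
                for (std::size_t i = i2; i < iend; ++i)
                  C[i + j * m] += A[i + p * m] * B[p + j * k];   // column-major indexing
          }
        }
      }
    }

The exact loop order and the placement of the packing steps differ in the real kernels; the point is only that each level of the loop nest reuses data that the corresponding block size keeps resident in one level of the cache hierarchy.
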
- BlockingType blocking(m, n, k, true); + BlockingType blocking(m, n, k, 1, true); const Index kc = blocking.kc(); const Index mc = (std::min)(m, blocking.mc()); const Index nc = (std::min)(n, blocking.nc()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index dc0513305..8e4c7c11d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -152,7 +152,7 @@ struct TensorEvaluator(kc, mc, nc/*, num_threads*/); + internal::computeProductBlockingSizes(kc, mc, nc, num_threads); eigen_assert(mc <= m); eigen_assert(nc <= n); eigen_assert(kc <= k); @@ -197,9 +197,10 @@ struct TensorEvaluator kernel_promises(num_kernel_promises, p); + std::vector kernel_promises(num_kernel_promises); + for (int i = 0; i < kernel_promises.size(); ++i) { + kernel_promises[i].set_value(); + } for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { const Index k_start = k_block_idx * kc; @@ -275,8 +276,7 @@ struct TensorEvaluator) Func; - this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); + this->m_device.enqueueNoFuture(&Self::packRhsAndKernel, arg); } } } @@ -338,7 +338,6 @@ struct TensorEvaluator Date: Thu, 16 Oct 2014 14:52:50 -0700 Subject: Silenced a few compilation warnings Generalized a TensorMap constructor --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 3 ++- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 2 +- unsupported/test/cxx11_tensor_fixed_size.cpp | 10 +++++----- 13 files changed, 24 insertions(+), 22 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 879057f38..ceed09505 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -1,6 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// +// Copyright (C) 2014 Benoit Steiner // Copyright (C) 2013 Christian Seiler // // This Source Code Form is subject to the terms of the Mozilla @@ -82,7 +83,7 @@ class Tensor : public TensorBase > static const std::size_t NumIndices = NumIndices_; - typedef DSizes Dimensions; + typedef DSizes Dimensions; protected: TensorStorage m_storage; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 0e55d4de1..2bd158dac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -114,7 +114,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 9ecea9108..3aa3eba24 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -136,7 +136,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index b8e43f484..74485b15b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -140,7 +140,7 @@ struct TensorEvaluator m_outputStrides; array m_leftStrides; array m_rightStrides; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; + const Axis m_axis; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index cd992daab..0db34adb1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -671,10 +671,10 @@ struct TensorContractionEvaluatorBase Index m_j_size; Index m_k_size; - const Device& m_device; - Scalar* m_result; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; + const Device& m_device; + Scalar* m_result; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 34bdd5309..50cb10a33 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -230,7 +230,7 @@ struct TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 2c0d2cd0f..0a8c10ac7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -64,7 +64,8 @@ template class TensorMap : public Tensor } #endif - inline TensorMap(PointerArgType dataPtr, const array& dimensions) + template + inline TensorMap(PointerArgType dataPtr, const Dimensions& 
dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 13109f514..686bf5c24 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -130,8 +130,8 @@ struct TensorEvaluator, Device> Scalar* data() const { return m_impl.data(); } protected: - NewDimensions m_dimensions; TensorEvaluator m_impl; + NewDimensions m_dimensions; }; @@ -381,13 +381,13 @@ struct TensorEvaluator, Devi return inputIndex; } - Dimensions m_dimensions; array m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; - const StartIndices m_offsets; TensorEvaluator m_impl; const Device& m_device; + Dimensions m_dimensions; + const StartIndices m_offsets; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 8da6e0f26..89c0cff05 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -215,11 +215,11 @@ struct TensorEvaluator, Device return rslt; } - PaddingDimensions m_padding; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; + PaddingDimensions m_padding; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 01f2daf52..e2fe32d67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -120,7 +120,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index eef992106..cbe87394b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -152,7 +152,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -217,8 +217,8 @@ struct TensorEvaluator, Device> array m_preservedStrides; array m_reducedStrides; array m_reducedDims; - Op m_reducer; TensorEvaluator m_impl; + Op m_reducer; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 7e0063626..831a9f005 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -131,7 +131,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp 
b/unsupported/test/cxx11_tensor_fixed_size.cpp index b0501aaa3..99ffc7f07 100644 --- a/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -32,10 +32,10 @@ static void test_1d() vec1(5) = 42.0; vec2(5) = 5.0; float data3[6]; - TensorMap > > vec3(data3, 6); + TensorMap > > vec3(data3, Sizes<6>()); vec3 = vec1.sqrt(); float data4[6]; - TensorMap, RowMajor> > vec4(data4, 6); + TensorMap, RowMajor> > vec4(data4, Sizes<6>()); vec4 = vec2.sqrt(); VERIFY_IS_EQUAL((vec3.size()), 6); @@ -68,9 +68,9 @@ static void test_1d() static void test_2d() { float data1[6]; - TensorMap >> mat1(data1,2,3); + TensorMap >> mat1(data1, Sizes<2, 3>()); float data2[6]; - TensorMap, RowMajor>> mat2(data2,2,3); + TensorMap, RowMajor>> mat2(data2, Sizes<2, 3>()); VERIFY_IS_EQUAL((mat1.size()), 2*3); // VERIFY_IS_EQUAL((mat1.dimension(0)), 2); @@ -166,7 +166,7 @@ static void test_array() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { - mat1(array(i,j,k)) = val; + mat1(array{{i,j,k}}) = val; val += 1.0; } } -- cgit v1.2.3 From 65af852b54afca3c76c978c1bfd27d8a1451cab6 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 16 Oct 2014 15:02:30 -0700 Subject: Silenced one last warning --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 0db34adb1..c530b27a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -48,7 +48,7 @@ class BaseTensorContractionMapper { m_k_strides(k_strides) { } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(int i) { } + EIGEN_STRONG_INLINE void prefetch(int /*i*/) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row) const { -- cgit v1.2.3 From b1789c112b5cf8d478a03786c6c1243320aefd47 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 3 Nov 2014 08:51:33 -0800 Subject: Improved handling of 1d tensors --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 98 +++++++++++++++++++--- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 12 ++- 2 files changed, 99 insertions(+), 11 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index c530b27a7..8e898619d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -48,7 +48,7 @@ class BaseTensorContractionMapper { m_k_strides(k_strides) { } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(int /*i*/) { } + EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row) const { @@ -142,6 +142,13 @@ class BaseTensorContractionMapper { return IndexPair(linidx[0], linidx[1]); } + Index firstAligned(Index size) const { + return size; + } + Index stride() const { + return 1; + } + protected: const Tensor m_tensor; const nocontract_t m_nocontract_strides; @@ -202,6 +209,18 @@ class TensorContractionSubMapper { return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); } + template + EIGEN_ALWAYS_INLINE PacketT load(Index i) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), 
YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE); + return loadPacket(i); + } + + template + bool aligned(Index /*i*/) const { + return false; + } + private: const ParentMapper& m_base_mapper; const Index m_vert_offset; @@ -220,6 +239,7 @@ class TensorContractionInputMapper public: typedef BaseTensorContractionMapper Base; typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; TensorContractionInputMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, @@ -233,6 +253,10 @@ class TensorContractionInputMapper return SubMapper(*this, i, j); } + EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } + typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; @@ -306,6 +330,7 @@ class TensorContractionInputMapper Base; typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; TensorContractionInputMapper(const Tensor& tensor, const nocontract_t& nocontract_strides, @@ -319,6 +344,10 @@ class TensorContractionInputMapper::type Packet; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { @@ -592,41 +621,80 @@ struct TensorContractionEvaluatorBase if (this->m_lhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } else { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } } else { if (this->m_rhs_inner_dim_contiguous) { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } else { if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } else { - static_cast(this)->template evalTyped(buffer); + static_cast(this)->template evalProduct(buffer); } } } } + template + void evalGemv(Scalar* buffer) const { + const Index rows = m_i_size; + const Index cols = m_k_size; + + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + const int lhs_packet_size = internal::packet_traits::size; + const int rhs_packet_size = internal::packet_traits::size; + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, + m_left_contracting_strides, m_k_strides); + RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, + m_right_contracting_strides, m_k_strides); + + const Scalar alpha(1); + const Index resIncr(1); + + // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) + m_device.memset(buffer, 0, rows * sizeof(Scalar)); + + internal::general_matrix_vector_product::run( + rows, cols, lhs, rhs, + 
buffer, resIncr, alpha); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -707,7 +775,17 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC void evalTyped(Scalar* buffer) const { + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + evalGemm(buffer); + } + + template + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index cf1352a31..f0e9bb616 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -93,7 +93,17 @@ struct TensorEvaluator - void evalTyped(Scalar* buffer) const { + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + evalGemm(buffer); + } + + template + void evalGemm(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; -- cgit v1.2.3 From 1d3c8306f87b284c26180be6eac13dc8d4aa1b52 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 13 Nov 2014 19:13:17 -0800 Subject: Fixed compilation errors with clang. H: Enter commit message. Lines beginning with 'HG:' are removed. --- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 1 - unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h | 1 - unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 10 +++++----- 4 files changed, 7 insertions(+), 9 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index a77903dca..8cb41aec8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -30,7 +30,6 @@ struct traits > : public traits::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 8e898619d..c5ec42cf4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -848,8 +848,8 @@ struct TensorEvaluator(this->m_device.allocate(sizeA * sizeof(LhsScalar))); RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index ce916fdfd..0dfb6913b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -37,7 +37,6 @@ struct traits > : public traits typedef typename XprTraits::Index Index; typedef typename XprType::Nested Nested; typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions + 1; }; template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 010221e74..eaf0195ce 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -110,7 +110,7 @@ 
struct tuple_coeff<0> { update_value(std::get<0>(t), value); } template - static constexpr bool value_known_statically(const DenseIndex i, const std::tuple& t) { + static constexpr bool value_known_statically(const DenseIndex i, const std::tuple&) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr return is_compile_time_constant >::type>::value & (i == 0); } @@ -190,7 +190,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] == value; + (IndexList()[i] == value); } }; @@ -198,7 +198,7 @@ template struct index_statically_eq > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] == value; + (IndexList()[i] == value); } }; @@ -213,7 +213,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] != value; + (IndexList()[i] != value); } }; @@ -221,7 +221,7 @@ template struct index_statically_ne > { constexpr bool operator() (const DenseIndex i, const DenseIndex value) const { return IndexList().value_known_statically(i) & - IndexList()[i] != value; + (IndexList()[i] != value); } }; -- cgit v1.2.3 From 71676eaddd7fb6b8abdc5713f437750f3c963fcb Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 14 Jan 2015 12:36:57 -0800 Subject: Added support for RowMajor inputs to the contraction code. --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 265 +++++++++++++++------ .../Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 6 +- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 43 +++- 3 files changed, 220 insertions(+), 94 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index c5ec42cf4..a02a273e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -320,6 +320,8 @@ class TensorContractionInputMapper }; + + template struct max_n_1 { + static const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + template struct traits > { @@ -378,6 +388,10 @@ struct traits > typedef typename remove_reference::type _LhsNested; typedef typename remove_reference::type _RhsNested; + // From NumDims below. 
+ static const int NumDimensions = max_n_1::NumDimensions + traits::NumDimensions - 2 * array_size::value>::size; + static const int Layout = traits::Layout; + enum { Flags = 0, }; @@ -401,19 +415,19 @@ struct traits::NumDimensions + traits::NumDimensions - 2 * array_size::value>::size; }; } // end namespace internal - - template class TensorContractionOp : public TensorBase > { public: typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::internal::traits::Packet Packet; - typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::promote_storage_type::ret CoeffReturnType; typedef typename internal::promote_storage_type::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} - EIGEN_DEVICE_FUNC - const Indices& indices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } protected: typename LhsXprType::Nested m_lhs_xpr; @@ -444,12 +459,17 @@ class TensorContractionOp : public TensorBase struct max_n_1 { - static const size_t size = n; -}; -template <> struct max_n_1<0> { - static const size_t size = 1; -}; +template struct Cond {}; + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T1& choose(Cond, const T1& first, const T2&) { + return first; +} + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T2& choose(Cond, const T1&, const T2& second) { + return second; +} template @@ -467,37 +487,94 @@ struct TensorContractionEvaluatorBase typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketReturnType PacketReturnType; - typedef array::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; - - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; - - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; - - typedef DSizes Dimensions; - enum { IsAligned = true, PacketAccess = (internal::packet_traits::size > 1), + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_device(device), m_result(NULL) - { + // Most of the code is assuming that both input tensors are ColMajor. 
If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + static const int NumDims = internal::max_n_1::size; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + + typedef DSizes Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond(), + op.lhsExpression(), op.rhsExpression()), device), + m_rightImpl(choose(Cond(), + op.rhsExpression(), op.lhsExpression()), device), + m_device(device), + m_result(NULL) { + EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == + TensorEvaluator::Layout), + YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert((internal::array_size::value > 0) && "Must contract on some indices"); - array::Dimensions::count> lhs_strides; + + DSizes eval_left_dims; + DSizes eval_right_dims; + array, ContractDims> eval_op_indices; + if (Layout == ColMajor) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. 
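
[Illustrative aside, not part of the patch] A small worked example of the flip performed just below, with hypothetical shapes:

    Original (RowMajor): LHS rank 3, RHS rank 2, op.indices() = {(2, 0)}   // contract LHS axis 2 with RHS axis 0
    After the swap:      eval LHS = original RHS, so LDims = 2; eval RHS = original LHS, so RDims = 3
    Flip:                eval_op_indices[0].first  = LDims - 1 - op.indices()[0].second = 2 - 1 - 0 = 1
                         eval_op_indices[0].second = RDims - 1 - op.indices()[0].first  = 3 - 1 - 2 = 0

The cross between .first and .second reflects the LHS/RHS swap, and the "LDims - 1 - ..." / "RDims - 1 - ..." terms relocate each contracted axis inside the reversed-dimension (ColMajor) view of its operand.
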
+ for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[i].first; + } + } + + array lhs_strides; lhs_strides[0] = 1; - for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { - lhs_strides[i+1] = lhs_strides[i] * m_leftImpl.dimensions()[i]; + for (int i = 0; i < LDims-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; } - array::Dimensions::count> rhs_strides; + array rhs_strides; rhs_strides[0] = 1; - for (int i = 0; i < TensorEvaluator::Dimensions::count-1; ++i) { - rhs_strides[i+1] = rhs_strides[i] * m_rightImpl.dimensions()[i]; + for (int i = 0; i < RDims-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; } m_i_strides[0] = 1; @@ -515,27 +592,28 @@ struct TensorContractionEvaluatorBase m_lhs_inner_dim_contiguous = true; int dim_idx = 0; int nocontract_idx = 0; - const typename TensorEvaluator::Dimensions& left_dims = m_leftImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + + for (int i = 0; i < LDims; i++) { // find if we are contracting on index i of left tensor bool contracting = false; - for (int j = 0; j < internal::array_size::value; j++) { - if (op.indices()[j].first == i) { + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { contracting = true; break; } } if (!contracting) { // add dimension size to output dimensions - m_dimensions[dim_idx] = left_dims[i]; + m_dimensions[dim_idx] = eval_left_dims[i]; m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; if (dim_idx != i) { m_lhs_inner_dim_contiguous = false; } if (nocontract_idx+1 < internal::array_size::value) { - m_i_strides[nocontract_idx+1] = m_i_strides[nocontract_idx] * left_dims[i]; + m_i_strides[nocontract_idx+1] = + m_i_strides[nocontract_idx] * eval_left_dims[i]; } else { - m_i_size = m_i_strides[nocontract_idx] * left_dims[i]; + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; } dim_idx++; nocontract_idx++; @@ -543,22 +621,22 @@ struct TensorContractionEvaluatorBase } nocontract_idx = 0; - const typename TensorEvaluator::Dimensions& right_dims = m_rightImpl.dimensions(); - for (int i = 0; i < TensorEvaluator::Dimensions::count; i++) { + for (int i = 0; i < RDims; i++) { bool contracting = false; // find if we are contracting on index i of right tensor - for (int j = 0; j < internal::array_size::value; j++) { - if (op.indices()[j].second == i) { + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { contracting = true; break; } } if (!contracting) { - m_dimensions[dim_idx] = right_dims[i]; + m_dimensions[dim_idx] = eval_right_dims[i]; if (nocontract_idx+1 < internal::array_size::value) { - m_j_strides[nocontract_idx+1] = m_j_strides[nocontract_idx] * right_dims[i]; + m_j_strides[nocontract_idx+1] = + m_j_strides[nocontract_idx] * eval_right_dims[i]; } else { - m_j_size = m_j_strides[nocontract_idx] * right_dims[i]; + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; } m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; dim_idx++; @@ -573,12 +651,13 @@ struct TensorContractionEvaluatorBase // each tensor, we'll only look at the first tensor here. 
m_rhs_inner_dim_contiguous = true; m_rhs_inner_dim_reordered = false; - for (int i = 0; i < internal::array_size::value; i++) { - Index left = op.indices()[i].first; - Index right = op.indices()[i].second; + for (int i = 0; i < ContractDims; i++) { + Index left = eval_op_indices[i].first; + Index right = eval_op_indices[i].second; - Index size = left_dims[left]; - eigen_assert(size == right_dims[right] && "Contraction axes must be same size"); + Index size = eval_left_dims[left]; + eigen_assert(size == eval_right_dims[right] && + "Contraction axes must be same size"); if (i+1 < internal::array_size::value) { m_k_strides[i+1] = m_k_strides[i] * size; @@ -588,7 +667,7 @@ struct TensorContractionEvaluatorBase m_left_contracting_strides[i] = lhs_strides[left]; m_right_contracting_strides[i] = rhs_strides[right]; - if (i > 0 && right < op.indices()[i-1].second) { + if (i > 0 && right < eval_op_indices[i-1].second) { m_rhs_inner_dim_reordered = true; } if (right != i) { @@ -597,9 +676,16 @@ struct TensorContractionEvaluatorBase } // Scalar case. We represent the result as a 1d tensor of size 1. - if (TensorEvaluator::Dimensions::count + TensorEvaluator::Dimensions::count == 2 * internal::array_size::value) { + if (LDims + RDims == 2 * ContractDims) { m_dimensions[0] = 1; } + + // If the layout is RowMajor, we need to reverse the m_dimensions + if (Layout == RowMajor) { + for (int i = 0, j = NumDims - 1; i < j; i++, j--) { + std::swap(m_dimensions[i], m_dimensions[j]); + } + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -661,10 +747,10 @@ struct TensorContractionEvaluatorBase const Index rows = m_i_size; const Index cols = m_k_size; - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; const int lhs_packet_size = internal::packet_traits::size; const int rhs_packet_size = internal::packet_traits::size; typedef internal::TensorContractionInputMapper m_leftImpl; - TensorEvaluator m_rightImpl; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; const Device& m_device; Scalar* m_result; }; +// evaluator for default device template struct TensorEvaluator, Device> : - public TensorContractionEvaluatorBase, Device> > { + public TensorContractionEvaluatorBase< + TensorEvaluator, Device> > { typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; @@ -759,15 +846,35 @@ struct TensorEvaluator::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
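
[Illustrative aside, not part of the patch] The swap relies on a standard identity: a RowMajor tensor has the same memory layout as a ColMajor tensor with its dimensions reversed. For matrices, computing C = A * B with everything RowMajor is therefore, in ColMajor terms, computing C^T = B^T * A^T, using (A * B)^T = B^T * A^T. Swapping the roles of LHS and RHS and reversing each operand's dimensions lets the existing ColMajor kernels write out the RowMajor result directly; the final reversal of m_dimensions restores the user-visible ordering.
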
+ typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; - static const int NumDims = max_n_1::Dimensions::count + TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + static const int NumDims = internal::max_n_1::size; + + // Could we use NumDimensions here? typedef DSizes Dimensions; @@ -799,15 +906,15 @@ struct TensorEvaluatorm_device.memset(buffer, 0, m * n * sizeof(Scalar)); // define mr, nr, and all of my data mapper types - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; typedef typename internal::gebp_traits Traits; const Index nr = Traits::nr; const Index mr = Traits::mr; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; const int lhs_packet_size = internal::packet_traits::size; const int rhs_packet_size = internal::packet_traits::size; @@ -826,10 +933,10 @@ struct TensorEvaluator OutputMapper; - // Declare GEBP packing and kernel structs internal::gemm_pack_lhs pack_lhs; internal::gemm_pack_rhs pack_rhs; + internal::gebp_kernel gebp; // initialize data mappers diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index f6bd949bd..588770bb4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1241,10 +1241,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; - static const int NumDims = max_n_1::size; + static const int NumDims = internal::max_n_1::size; typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index f0e9bb616..5851e5adc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -70,24 +70,43 @@ struct TensorEvaluator::Dimensions::count> left_dim_mapper_t; - typedef array::Dimensions::count> right_dim_mapper_t; - - typedef array::value> contract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> left_nocontract_t; - typedef array::Dimensions::count - internal::array_size::value>::size> right_nocontract_t; - - static const int NumDims = max_n_1::Dimensions::count + 
TensorEvaluator::Dimensions::count - 2 * internal::array_size::value>::size; + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array::size> left_nocontract_t; + typedef array::size> right_nocontract_t; + + static const int NumDims = max_n_1::size; typedef DSizes Dimensions; // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; typedef typename internal::gebp_traits Traits; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} -- cgit v1.2.3 From e896c0ade7c77a18acb1b3ef01f22ef698c1a2a2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 29 Jan 2015 10:29:47 -0800 Subject: Marked the contraction operation as read only, since its result can't be assigned. 
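
[Illustrative aside, not part of the patch] In practice this means a contraction expression can be evaluated and assigned into another tensor, but cannot itself appear on the left-hand side of an assignment. A rough usage sketch, assuming the usual contract() front-end and Eigen::IndexPair (hypothetical sizes):

    // #include <unsupported/Eigen/CXX11/Tensor>
    Eigen::Tensor<float, 2> a(3, 4), b(4, 5);
    a.setRandom();
    b.setRandom();
    Eigen::array<Eigen::IndexPair<int>, 1> dims = {{ Eigen::IndexPair<int>(1, 0) }};
    Eigen::Tensor<float, 2> c = a.contract(b, dims);   // fine: the contraction result is read
    // a.contract(b, dims) = c;                        // rejected: ReadOnlyAccessors exposes no assignment operators
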
--- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index a02a273e7..af843654c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -423,7 +423,7 @@ struct traits -class TensorContractionOp : public TensorBase > +class TensorContractionOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; -- cgit v1.2.3 From 410895a7e4276fa2e1f78dbb953c7045818a86ae Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:13:19 -0800 Subject: Silenced several compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 16 ++++++++-------- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 18 +++++++++--------- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 4 ++-- 6 files changed, 24 insertions(+), 24 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 93938bd1b..a4f73b2a1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -103,7 +103,7 @@ struct TensorEvaluator, Device> m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { - EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == TensorEvaluator::Layout), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); // The dimensions of the lhs and the rhs tensors should be equal to prevent // overflows and ensure the result is fully initialized. eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_leftImpl.dimensions())); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 503803d23..698bcfe18 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -257,13 +257,13 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; - if ((Layout == ColMajor && m_dim.actualDim() == 0) || - (Layout == RowMajor && m_dim.actualDim() == NumInputDims-1)) { + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if ((Layout == ColMajor && m_dim.actualDim() == NumInputDims-1) || - (Layout == RowMajor && m_dim.actualDim() == 0)) { + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. 
eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; @@ -322,8 +322,8 @@ struct TensorEvaluator, Device> static const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - if ((this->Layout == ColMajor && this->m_dim.actualDim() == 0) || - (this->Layout == RowMajor && this->m_dim.actualDim() == NumInputDims-1)) { + if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(this->m_stride == 1); EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; @@ -333,8 +333,8 @@ struct TensorEvaluator, Device> this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; } - } else if ((this->Layout == ColMajor && this->m_dim.actualDim() == NumInputDims-1) || - (this->Layout == RowMajor && this->m_dim.actualDim() == 0)) { + } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(this->m_stride > index); this->m_impl.template writePacket(index + this->m_inputOffset, x); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index af843654c..e750c21e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -499,9 +499,9 @@ struct TensorContractionEvaluatorBase // If we want to compute A * B = C, where A is LHS and B is RHS, the code // will pretend B is LHS and A is RHS. 
typedef typename internal::conditional< - Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; @@ -520,14 +520,14 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(choose(Cond(), + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond(), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.rhsExpression(), op.lhsExpression()), device), m_device(device), m_result(NULL) { - EIGEN_STATIC_ASSERT((TensorEvaluator::Layout == - TensorEvaluator::Layout), + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert((internal::array_size::value > 0) && "Must contract on some indices"); @@ -681,7 +681,7 @@ struct TensorContractionEvaluatorBase } // If the layout is RowMajor, we need to reverse the m_dimensions - if (Layout == RowMajor) { + if (static_cast(Layout) == static_cast(RowMajor)) { for (int i = 0, j = NumDims - 1; i < j; i++, j--) { std::swap(m_dimensions[i], m_dimensions[j]); } @@ -855,9 +855,9 @@ struct TensorEvaluator::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index e358e6a3a..8b87f1045 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -79,9 +79,9 @@ struct TensorEvaluator::type EvalLeftArgType; + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; typedef typename internal::conditional< - Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType; + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; static const int LDims = internal::array_size::Dimensions>::value; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 97f225f0a..5e167d4aa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -94,14 +94,14 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { eigen_assert(m_data); - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { return m_data[m_dims.IndexOfRowMajor(coords)]; } } - Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } protected: Scalar* m_data; diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 7e448f7c0..c00810594 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -112,7 +112,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - Layout = (TensorEvaluator::Layout == ColMajor) ? RowMajor : ColMajor, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented }; @@ -169,7 +169,7 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - Layout = (TensorEvaluator::Layout == ColMajor) ? RowMajor : ColMajor, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented }; -- cgit v1.2.3 From 114e863f086077fc949baf5dfe1f4102222c938e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 12:20:24 -0800 Subject: Silcenced a few compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 8 ++++---- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 12 ++++++------ unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 14 +++++++------- 4 files changed, 18 insertions(+), 18 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index ef134adf2..5790e19d6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -106,7 +106,7 @@ struct TensorEvaluator, Device> m_dimensions[i] = input_dims[i] * broadcast[i]; } - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_inputStrides[0] = 1; m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { @@ -139,7 +139,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return coeffColMajor(index); } else { return coeffRowMajor(index); @@ -210,7 +210,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const { - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return packetColMajor(index); } else { return packetRowMajor(index); @@ -326,7 +326,7 @@ struct TensorEvaluator, Device> } - Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: Dimensions m_dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index e750c21e7..f7254a24d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -536,7 +536,7 @@ struct TensorContractionEvaluatorBase DSizes eval_left_dims; DSizes eval_right_dims; array, ContractDims> eval_op_indices; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { // For ColMajor, we keep using the existing dimensions for (int i = 0; i < LDims; i++) { eval_left_dims[i] = m_leftImpl.dimensions()[i]; diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 5e167d4aa..488d32cb4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -167,7 +167,7 @@ struct TensorEvaluator #endif } - const Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } protected: const Scalar* m_data; @@ -218,7 +218,7 @@ struct TensorEvaluator, Device> return m_functor.packetOp(index); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: const NullaryOp m_functor; @@ -273,7 +273,7 @@ struct TensorEvaluator, Device> return m_functor.packetOp(m_argImpl.template packet(index)); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: const UnaryOp m_functor; @@ -301,7 +301,7 @@ struct TensorEvaluator::Layout == TensorEvaluator::Layout || internal::traits::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); } @@ -337,7 +337,7 @@ struct TensorEvaluator(index), m_rightImpl.template packet(index)); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: const BinaryOp m_functor; @@ -413,7 +413,7 @@ struct TensorEvaluator m_elseImpl.template packet(index)); } - CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } private: TensorEvaluator m_condImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 21416afe0..7643d4cdc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -245,7 +245,7 @@ struct TensorEvaluator, Device> } // Precompute output strides. - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { m_outputStrides[0] = 1; for (int i = 1; i < NumOutputDims; ++i) { m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; @@ -259,7 +259,7 @@ struct TensorEvaluator, Device> // Precompute input strides. array input_strides; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { input_strides[0] = 1; for (int i = 1; i < NumInputDims; ++i) { input_strides[i] = input_strides[i-1] * input_dims[i-1]; @@ -309,7 +309,7 @@ struct TensorEvaluator, Device> Op reducer(m_reducer); if (ReducingInnerMostDims) { const Index num_values_to_reduce = - (Layout == ColMajor) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; return internal::InnerMostDimReducer::reduce(*this, firstInput(index), num_values_to_reduce, reducer); } else { @@ -330,7 +330,7 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_DEFAULT typename internal::remove_const::type values[packetSize]; if (ReducingInnerMostDims) { const Index num_values_to_reduce = - (Layout == ColMajor) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; + (static_cast(Layout) == static_cast(ColMajor)) ? 
m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1]; const Index firstIndex = firstInput(index); for (Index i = 0; i < packetSize; ++i) { Op reducer(m_reducer); @@ -339,7 +339,7 @@ struct TensorEvaluator, Device> } } else if (PreservingInnerMostDims) { const Index firstIndex = firstInput(index); - const int innermost_dim = (Layout == ColMajor) ? 0 : NumOutputDims - 1; + const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; // TBD: extend this the the n innermost dimensions that we preserve. if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) { Op reducer(m_reducer); @@ -371,7 +371,7 @@ struct TensorEvaluator, Device> // used to compute the reduction at output index "index". EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { if (ReducingInnerMostDims) { - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { return index * m_preservedStrides[0]; } else { return index * m_preservedStrides[NumOutputDims - 1]; @@ -379,7 +379,7 @@ struct TensorEvaluator, Device> } // TBD: optimize the case where we preserve the innermost dimensions. Index startInput = 0; - if (Layout == ColMajor) { + if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumOutputDims - 1; i > 0; --i) { // This is index_i in the output tensor. const Index idx = index / m_outputStrides[i]; -- cgit v1.2.3
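
[Illustrative aside, not part of the patches] The last two commits replace comparisons such as Layout == ColMajor with static_cast<int>(Layout) == static_cast<int>(ColMajor). The two sides belong to different enumeration types (the evaluator's anonymous enum { Layout = ... } versus the library-wide storage-order enum), and some compilers warn about comparing values of distinct enum types (for example clang's and GCC's -Wenum-compare). Casting both sides to int leaves the comparison's value unchanged while silencing the warning. A minimal, self-contained illustration with hypothetical types, not Eigen's own declarations:

    enum StorageOrder { ColMajor = 0, RowMajor = 1 };

    struct Evaluator {
      enum { Layout = RowMajor };   // anonymous enum, initialized from a StorageOrder value
    };

    bool is_col_major() {
      // return Evaluator::Layout == ColMajor;   // may trigger -Wenum-compare: distinct enumeration types
      return static_cast<int>(Evaluator::Layout) == static_cast<int>(ColMajor);   // same result, warning-free
    }
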