From 7402fea0a8e63e3ea248257047c584afee8f8bde Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 16 May 2014 15:08:05 -0700
Subject: Vectorized the evaluation of tensor expressions (using SSE, AVX,
 NEON, ...)

Added the ability to parallelize the evaluation of a tensor expression
over multiple cpu cores.
Added the ability to offload the evaluation of a tensor expression to a
GPU.
---
 unsupported/Eigen/CXX11/src/Tensor/Tensor.h         |   8 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h   | 145 ++++++++++++++++++-
 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h     |  12 ++
 unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h   |  83 +++++++++++
 .../Eigen/CXX11/src/Tensor/TensorDeviceType.h       |  56 ++++++++
 .../Eigen/CXX11/src/Tensor/TensorDimensions.h       |  14 +-
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h        |  54 +++++--
 unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h     |  27 ++--
 .../Eigen/CXX11/src/Tensor/TensorFixedSize.h        |  10 +-
 .../CXX11/src/Tensor/TensorForwardDeclarations.h    |   4 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorMap.h      | 158 +++++++++++++++++++--
 unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h  |  19 ---
 12 files changed, 524 insertions(+), 66 deletions(-)
 create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
 create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
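A usage sketch of the new device API (dimensions and variable names are made
up; assumes a C++11 compiler, EIGEN_USE_THREADS / EIGEN_USE_GPU defined for
the respective paths, and nvcc for the gpu one):

    Eigen::Tensor<float, 2> A(30, 40), B(30, 40), C(30, 40);
    // ... fill A and B ...

    C = A + B;                           // evaluated on a single cpu core

    Eigen::ThreadPoolDevice threads(4);  // partition the work over 4 cores
    C.device(threads) = A + B;

    // The gpu path expects the coefficients to live in device memory,
    // e.g. TensorMaps over cudaMalloc'ed buffers (gpu_a, gpu_b, gpu_c
    // below are hypothetical such maps):
    Eigen::GpuDevice gpu;
    gpu_c.device(gpu) = gpu_a - gpu_b;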
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index f5c027d1c..d8ff3f584 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -75,9 +75,15 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_> >
     typedef typename internal::traits<Tensor>::StorageKind StorageKind;
     typedef typename internal::traits<Tensor>::Index Index;
     typedef Scalar_ Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename internal::packet_traits<Scalar>::type Packet;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef typename Base::CoeffReturnType CoeffReturnType;
+    typedef typename Base::PacketReturnType PacketReturnType;
+
+    enum {
+      IsAligned = bool(EIGEN_ALIGN),
+      PacketAccess = true,
+    };
 
     static const int Options = Options_;
     static const std::size_t NumIndices = NumIndices_;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index f1df827f9..e69ff6188 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -10,6 +10,9 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
 #define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
 
+#ifdef EIGEN_USE_THREADS
+#include <future>
+#endif
 
 namespace Eigen {
 
 /** \class TensorAssign
   * \ingroup CXX11_Tensor_Module
   *
   * \brief The tensor assignment class.
   *
   * This class is responsible for triggering the evaluation of the expressions.
   */
 namespace internal {
 
-template<typename Derived1, typename Derived2>
+// Default strategy: the expressions are evaluated with a single cpu thread.
+template<typename Derived1, typename Derived2, bool Vectorizable = TensorEvaluator<Derived1>::PacketAccess & TensorEvaluator<Derived2>::PacketAccess>
 struct TensorAssign
 {
   typedef typename Derived1::Index Index;
@@ -38,13 +42,150 @@ struct TensorAssign
   EIGEN_DEVICE_FUNC
   static inline void run(Derived1& dst, const Derived2& src)
   {
     TensorEvaluator<Derived1> evalDst(dst);
     TensorEvaluator<Derived2> evalSrc(src);
     const Index size = dst.size();
-    for(Index i = 0; i < size; ++i) {
+    for (Index i = 0; i < size; ++i) {
       evalDst.coeffRef(i) = evalSrc.coeff(i);
     }
   }
 };
 
+
+template<typename Derived1, typename Derived2>
+struct TensorAssign<Derived1, Derived2, true>
+{
+  typedef typename Derived1::Index Index;
+  EIGEN_DEVICE_FUNC
+  static inline void run(Derived1& dst, const Derived2& src)
+  {
+    TensorEvaluator<Derived1> evalDst(dst);
+    TensorEvaluator<Derived2> evalSrc(src);
+    const Index size = dst.size();
+
+    static const int LhsStoreMode = TensorEvaluator<Derived1>::IsAligned ? Aligned : Unaligned;
+    static const int RhsLoadMode = TensorEvaluator<Derived2>::IsAligned ? Aligned : Unaligned;
+    static const int PacketSize = unpacket_traits<typename TensorEvaluator<Derived1>::PacketReturnType>::size;
+    const Index VectorizedSize = (size / PacketSize) * PacketSize;
+
+    for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+      evalDst.template writePacket<LhsStoreMode>(i, evalSrc.template packet<RhsLoadMode>(i));
+    }
+    for (Index i = VectorizedSize; i < size; ++i) {
+      evalDst.coeffRef(i) = evalSrc.coeff(i);
+    }
+  }
+};
+
+
+// Multicore strategy: the index space is partitioned and each core is assigned to a partition.
+#ifdef EIGEN_USE_THREADS
+template <typename LhsEval, typename RhsEval, typename Index, bool Vectorizable = LhsEval::PacketAccess & RhsEval::PacketAccess>
+struct EvalRange {
+  static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) {
+    eigen_assert(last > first);
+    for (Index i = first; i < last; ++i) {
+      dst.coeffRef(i) = src.coeff(i);
+    }
+  }
+};
+
+template <typename LhsEval, typename RhsEval, typename Index>
+struct EvalRange<LhsEval, RhsEval, Index, true> {
+  static void run(LhsEval& dst, const RhsEval& src, const Index first, const Index last) {
+    eigen_assert(last > first);
+
+    Index i = first;
+    static const int PacketSize = unpacket_traits<typename LhsEval::PacketReturnType>::size;
+    if (last - first > PacketSize) {
+      static const int LhsStoreMode = LhsEval::IsAligned ? Aligned : Unaligned;
+      static const int RhsLoadMode = RhsEval::IsAligned ? Aligned : Unaligned;
+      eigen_assert(first % PacketSize == 0);
+      Index lastPacket = last - (last % PacketSize);
+      for (; i < lastPacket; i += PacketSize) {
+        dst.template writePacket<LhsStoreMode>(i, src.template packet<RhsLoadMode>(i));
+      }
+    }
+
+    for (; i < last; ++i) {
+      dst.coeffRef(i) = src.coeff(i);
+    }
+  }
+};
+
+template<typename Derived1, typename Derived2>
+struct TensorAssignMultiThreaded
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1& dst, const Derived2& src, const ThreadPoolDevice& device)
+  {
+    TensorEvaluator<Derived1> evalDst(dst);
+    TensorEvaluator<Derived2> evalSrc(src);
+    const Index size = dst.size();
+
+    static const bool Vectorizable = TensorEvaluator<Derived1>::PacketAccess & TensorEvaluator<Derived2>::PacketAccess;
+    static const int PacketSize = Vectorizable ? unpacket_traits<typename TensorEvaluator<Derived1>::PacketReturnType>::size : 1;
+
+    int blocksz = static_cast<int>(ceil(static_cast<float>(size)/device.numThreads()) + PacketSize - 1);
+    const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+    const Index numblocks = size / blocksize;
+
+    std::vector<std::future<void> > results;
+    results.reserve(numblocks);
+    for (int i = 0; i < numblocks; ++i) {
+      results.push_back(std::async(std::launch::async, &EvalRange<TensorEvaluator<Derived1>, TensorEvaluator<Derived2>, Index>::run, evalDst, evalSrc, i*blocksize, (i+1)*blocksize));
+    }
+
+    for (int i = 0; i < numblocks; ++i) {
+      results[i].get();
+    }
+
+    if (numblocks * blocksize < size) {
+      EvalRange<TensorEvaluator<Derived1>, TensorEvaluator<Derived2>, Index>::run(evalDst, evalSrc, numblocks * blocksize, size);
+    }
+  }
+};
+#endif
+
+
+// GPU: the evaluation of the expressions is offloaded to a GPU.
+#ifdef EIGEN_USE_GPU
+template <typename LhsEvaluator, typename RhsEvaluator>
+__global__ void EigenMetaKernelNoCheck(LhsEvaluator evalDst, const RhsEvaluator evalSrc) {
+  const int index = blockIdx.x * blockDim.x + threadIdx.x;
+  evalDst.coeffRef(index) = evalSrc.coeff(index);
+}
+template <typename LhsEvaluator, typename RhsEvaluator>
+__global__ void EigenMetaKernelPeel(LhsEvaluator evalDst, const RhsEvaluator evalSrc, int peel_start_offset, int size) {
+  const int index = peel_start_offset + blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < size) {
+    evalDst.coeffRef(index) = evalSrc.coeff(index);
+  }
+}
+
+template<typename Derived1, typename Derived2>
+struct TensorAssignGpu
+{
+  typedef typename Derived1::Index Index;
+  static inline void run(Derived1& dst, const Derived2& src, const GpuDevice& device)
+  {
+    TensorEvaluator<Derived1> evalDst(dst);
+    TensorEvaluator<Derived2> evalSrc(src);
+    const Index size = dst.size();
+    const int block_size = std::min<Index>(size, 32*32);
+    const int num_blocks = size / block_size;
+    EigenMetaKernelNoCheck<TensorEvaluator<Derived1>, TensorEvaluator<Derived2> > <<<num_blocks, block_size, 0, device.stream()>>>(evalDst, evalSrc);
+
+    const int remaining_items = size % block_size;
+    if (remaining_items > 0) {
+      const int peel_start_offset = num_blocks * block_size;
+      const int peel_block_size = std::min<Index>(size, 32);
+      const int peel_num_blocks = (remaining_items + peel_block_size - 1) / peel_block_size;
+      EigenMetaKernelPeel<TensorEvaluator<Derived1>, TensorEvaluator<Derived2> > <<<peel_num_blocks, peel_block_size, 0, device.stream()>>>(evalDst, evalSrc, peel_start_offset, size);
+    }
+  }
+};
+#endif
+
 } // end namespace internal
 
 } // end namespace Eigen
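To make the partitioning in TensorAssignMultiThreaded concrete, here is the
block-size arithmetic traced by hand for one illustrative configuration (the
numbers are made up):

    // size = 1000 coefficients, device.numThreads() = 4, PacketSize = 4
    // blocksz   = ceil(1000 / 4.0) + 4 - 1 = 253
    // blocksize = max(4, 253 - 253 % 4)    = 252   (a multiple of PacketSize)
    // numblocks = 1000 / 252               = 3
    //
    // Three std::async tasks then cover [0,252), [252,504) and [504,756),
    // and the trailing EvalRange call handles the remainder [756,1000).

Rounding each partition up to a multiple of PacketSize is what guarantees the
eigen_assert(first % PacketSize == 0) in the vectorized EvalRange. On the gpu
side, note that both kernels are launched on device.stream(), so the
assignment is asynchronous with respect to the host; a caller that needs the
result immediately has to synchronize that stream (e.g. with
cudaStreamSynchronize).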
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 9c7783aaf..fa1bd3498 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -28,6 +28,7 @@ class TensorBase
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::traits<Derived>::Index Index;
     typedef Scalar CoeffReturnType;
+    typedef typename internal::packet_traits<Scalar>::type PacketReturnType;
 
     Derived& setZero() {
       return setConstant(Scalar(0));
@@ -83,6 +84,17 @@ class TensorBase
       return TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
     }
 
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived>
+    operator-(const OtherDerived& other) const {
+      return TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
+    }
+
+    template <typename DeviceType>
+    TensorDevice<Derived, DeviceType> device(const DeviceType& device) {
+      return TensorDevice<Derived, DeviceType>(device, derived());
+    }
+
   protected:
     template <typename OtherDerived> friend class TensorBase;
     EIGEN_DEVICE_FUNC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
new file mode 100644
index 000000000..71890e187
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -0,0 +1,83 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
+
+namespace Eigen {
+
+/** \class TensorDevice
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Pseudo expression providing an operator = that will evaluate its argument
+  * on the specified computing 'device' (GPU, thread pool, ...)
+  *
+  * Example:
+  *    C.device(EIGEN_GPU) = A + B;
+  *
+  * Todo: thread pools.
+  * Todo: operator +=, -=, *= and so on.
+  */
+
+template <typename ExpressionType, typename DeviceType> class TensorDevice {
+  public:
+    TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
+      internal::TensorAssign<ExpressionType, const OtherDerived>::run(m_expression, other);
+      return *this;
+    }
+
+  protected:
+    const DeviceType& m_device;
+    ExpressionType& m_expression;
+};
+
+
+#ifdef EIGEN_USE_THREADS
+template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice> {
+  public:
+    TensorDevice(const ThreadPoolDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
+      internal::TensorAssignMultiThreaded<ExpressionType, const OtherDerived>::run(m_expression, other, m_device);
+      return *this;
+    }
+
+  protected:
+    const ThreadPoolDevice& m_device;
+    ExpressionType& m_expression;
+};
+#endif
+
+
+#ifdef EIGEN_USE_GPU
+template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
+{
+  public:
+    TensorDevice(const GpuDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
+      internal::TensorAssignGpu<ExpressionType, const OtherDerived>::run(m_expression, other, m_device);
+      return *this;
+    }
+
+  protected:
+    const GpuDevice& m_device;
+    ExpressionType& m_expression;
+};
+#endif
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
new file mode 100644
index 000000000..ded6ca604
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -0,0 +1,56 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
+
+namespace Eigen {
+
+// Default device for the machine (typically a single cpu core)
+struct DefaultDevice {
+};
+
+
+// Multiple cpu cores
+// We should really use a thread pool here but first we need to find a portable thread pool library.
+#ifdef EIGEN_USE_THREADS
+struct ThreadPoolDevice {
+  ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { }
+  size_t numThreads() const { return num_threads_; }
+  /*ThreadPool* threadPool() const { return pool_; }*/
+
+ private:
+  // todo: NUMA, ...
+  size_t num_threads_;
+  /*ThreadPool* pool_;*/
+};
+#endif
+
+
+// GPU offloading
+#ifdef EIGEN_USE_GPU
+struct GpuDevice {
+  // todo: support for multiple gpus
+  GpuDevice() {
+    cudaStreamCreate(&stream_);
+  }
+  ~GpuDevice() {
+    cudaStreamDestroy(stream_);
+  }
+  const cudaStream_t& stream() const { return stream_; }
+
+ private:
+  cudaStream_t stream_;
+};
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index bd3bd5aca..43e9d6550 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -79,16 +79,16 @@ struct Sizes : internal::numeric_list<std::size_t, Indices...>
   Sizes() { }
   template <typename DenseIndex>
-  explicit Sizes(const array<DenseIndex, Base::count>&/* indices*/) {
+  explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
     // todo: add assertion
   }
 #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
-  explicit Sizes(std::initializer_list<std::size_t>/* l*/) {
+  explicit Sizes(std::initializer_list<std::size_t> /*l*/) {
     // todo: add assertion
   }
 #endif
 
-  template <typename T> Sizes& operator = (const T&/* other*/) {
+  template <typename T> Sizes& operator = (const T& /*other*/) {
     // add assertion failure if the size of other is different
     return *this;
   }
@@ -119,7 +119,7 @@ template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct Sizes
-  static const size_t TotalSize() {
+  static size_t TotalSize() {
     return internal::arg_prod<V1, V2, V3, V4, V5>::value;
   }
 
@@ -181,14 +181,11 @@ template <typename DenseIndex, std::size_t NumDims>
 struct DSizes : array<DenseIndex, NumDims> {
   typedef array<DenseIndex, NumDims> Base;
 
-  size_t TotalSize() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const {
     return internal::array_prod(*static_cast<const Base*>(this));
   }
 
   DSizes() { }
-#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
-  //  explicit DSizes(std::initializer_list<DenseIndex> l) : Base(l) { }
-#endif
   explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }
 
   DSizes& operator = (const array<DenseIndex, NumDims>& other) {
@@ -203,7 +200,6 @@ struct DSizes : array<DenseIndex, NumDims> {
   size_t IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
     return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this));
   }
-
 };
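For reference, the two linearizations implemented by IndexOfColMajor and
IndexOfRowMajor, written out for a hypothetical 2x3x4 tensor and indices
(i0, i1, i2):

    // col-major (Eigen's default): index = i0 + 2 * (i1 + 3 * i2)  -- first index moves fastest
    // row-major:                   index = i2 + 4 * (i1 + 3 * i0)  -- last index moves fastest
    //
    // e.g. (i0,i1,i2) = (1,2,3): 1 + 2*(2 + 3*3) = 23 and 3 + 4*(2 + 3*1) = 23,
    // the last of the 24 coefficients in either layout.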
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index b0dbca041..3ce924dc3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -29,32 +29,38 @@ struct TensorEvaluator
 {
   typedef typename Derived::Index Index;
   typedef typename Derived::Scalar Scalar;
-  typedef typename Derived::Scalar& CoeffReturnType;
+  typedef typename Derived::Packet Packet;
+  typedef typename Derived::Scalar CoeffReturnType;
+  typedef typename Derived::Packet PacketReturnType;
+
+  enum {
+    IsAligned = Derived::IsAligned,
+    PacketAccess = Derived::PacketAccess,
+  };
 
   TensorEvaluator(Derived& m)
       : m_data(const_cast<Scalar*>(m.data()))
   { }
 
-  CoeffReturnType coeff(Index index) const {
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const {
     return m_data[index];
   }
 
-  Scalar& coeffRef(Index index) {
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index) {
     return m_data[index];
   }
 
-  // to do: vectorized evaluation.
-  /* template<int LoadMode>
+  template<int LoadMode>
   PacketReturnType packet(Index index) const
   {
-    return ploadt<PacketScalar, LoadMode>(m_data + index);
+    return internal::ploadt<Packet, LoadMode>(m_data + index);
   }
 
-  template<int StoreMode>
-  void writePacket(Index index, const PacketScalar& x)
+  template<int StoreMode>
+  void writePacket(Index index, const Packet& x)
   {
-    return pstoret<Scalar, PacketScalar, StoreMode>(const_cast<Scalar*>(m_data) + index, x);
-  }*/
+    return internal::pstoret<Scalar, Packet, StoreMode>(m_data + index, x);
+  }
 
  protected:
   Scalar* m_data;
@@ -70,6 +76,11 @@ struct TensorEvaluator<TensorCwiseUnaryOp<UnaryOp, ArgType> >
 {
   typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
 
+  enum {
+    IsAligned = TensorEvaluator<ArgType>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
+  };
+
   TensorEvaluator(const XprType& op)
     : m_functor(op.functor()),
       m_argImpl(op.nestedExpression())
   { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketReturnType PacketReturnType;
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_functor(m_argImpl.coeff(index));
   }
 
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+  {
+    return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
+  }
+
  private:
   const UnaryOp m_functor;
   TensorEvaluator<ArgType> m_argImpl;
@@ -96,6 +114,12 @@ struct TensorEvaluator<TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> >
 {
   typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
 
+  enum {
+    IsAligned = TensorEvaluator<LeftArgType>::IsAligned & TensorEvaluator<RightArgType>::IsAligned,
+    PacketAccess = TensorEvaluator<LeftArgType>::PacketAccess & TensorEvaluator<RightArgType>::PacketAccess &
+                   internal::functor_traits<BinaryOp>::PacketAccess,
+  };
+
   TensorEvaluator(const XprType& op)
     : m_functor(op.functor()),
       m_leftImpl(op.lhsExpression()),
       m_rightImpl(op.rhsExpression())
   { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketReturnType PacketReturnType;
 
-  CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
     return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index));
   }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+  {
+    return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
+  }
 
  private:
   const BinaryOp m_functor;
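The LoadMode/StoreMode template arguments map directly onto Eigen's existing
packet primitives. A minimal standalone use of those primitives (a sketch; the
packet type depends on the target, e.g. 4 floats with SSE, and degenerates to
a scalar "packet" of size 1 without vectorization):

    #include <Eigen/Core>
    using namespace Eigen;

    typedef internal::packet_traits<float>::type Packet;
    static const int PacketSize = internal::unpacket_traits<Packet>::size;

    float data[16] = {0};                                  // large enough for any packet here
    Packet p = internal::ploadt<Packet, Unaligned>(data);  // LoadMode = Unaligned
    internal::pstoret<float, Packet, Unaligned>(data, p);  // StoreMode = Unaligned

Passing Aligned instead selects the cheaper aligned instructions, which is why
the evaluators only advertise IsAligned when the underlying storage is known
to be suitably aligned.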
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index aa875dc31..e32077f6e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -33,6 +33,9 @@ struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
   typedef typename result_of<
                     UnaryOp(typename XprType::Scalar)
                    >::type Scalar;
+  typedef typename result_of<
+                    UnaryOp(typename XprType::Packet)
+                   >::type Packet;
   typedef typename XprType::Nested XprTypeNested;
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
 };
@@ -57,14 +60,16 @@ class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType> >
 {
   public:
-    typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar;
-    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-    typedef typename XprType::CoeffReturnType CoeffReturnType;
-    typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested;
-    typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind;
-    typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index;
-
-    inline TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+    typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar;
+    typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Packet Packet;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename XprType::PacketReturnType PacketReturnType;
+    typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
       : m_xpr(xpr), m_functor(func) {}
 
     EIGEN_DEVICE_FUNC
@@ -92,6 +97,7 @@ struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
                                       typename RhsXprType::Scalar
                                     )
                                   >::type Scalar;
+  typedef typename internal::packet_traits<Scalar>::type Packet;
   typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
                                         typename traits<RhsXprType>::StorageKind>::ret StorageKind;
   typedef typename promote_index_type<typename traits<LhsXprType>::Index,
@@ -123,14 +129,17 @@ class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
 {
   public:
     typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar;
+    typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Packet Packet;
     typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
     typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
                                                     typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+    typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType,
+                                                    typename RhsXprType::PacketReturnType>::ret PacketReturnType;
     typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested;
     typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind;
     typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index;
 
-    inline TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp())
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp())
       : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {}
 
     EIGEN_DEVICE_FUNC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index 953880123..dcc7ccd65 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -33,11 +33,17 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> >
     typedef typename internal::traits<TensorFixedSize>::StorageKind StorageKind;
     typedef typename internal::traits<TensorFixedSize>::Index Index;
     typedef Scalar_ Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename internal::packet_traits<Scalar>::type Packet;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef typename Base::CoeffReturnType CoeffReturnType;
 
-    static const int Options = Options_;
+    static const int Options = Options_;
+
+    enum {
+      IsAligned = bool(EIGEN_ALIGN),
+      PacketAccess = true,
+    };
+
     typedef Dimensions_ Dimensions;
     static const std::size_t NumIndices = Dimensions::count;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index e8a2125c4..09b0fe66d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -14,12 +14,14 @@ namespace Eigen {
 
 template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0> class Tensor;
 template<typename Scalar_, typename Dimensions, int Options_ = 0> class TensorFixedSize;
 template<typename PlainObjectType> class TensorMap;
 template<typename Derived> class TensorBase;
 
 template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
 template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
 
+template<typename ExpressionType, typename DeviceType> class TensorDevice;
+
 // Move to internal?
 template<typename Derived> struct TensorEvaluator;
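An expression's PacketAccess is anded with internal::functor_traits<Op>::PacketAccess,
so a functor only vectorizes if it both provides a packetOp and advertises it.
The shape of such a functor, modeled on Eigen's own scalar_sum_op (a sketch,
not part of this patch; my_plus_op is hypothetical):

    template <typename Scalar> struct my_plus_op {
      Scalar operator()(const Scalar& a, const Scalar& b) const { return a + b; }
      template <typename Packet>
      Packet packetOp(const Packet& a, const Packet& b) const {
        return Eigen::internal::padd(a, b);      // vectorized path
      }
    };

    namespace Eigen { namespace internal {
    template <typename Scalar> struct functor_traits<my_plus_op<Scalar> > {
      enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = true };
    };
    }}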
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index bb0b39c5a..3fc9c5335 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -22,16 +22,16 @@
 template<typename PlainObjectType> class TensorMap : public TensorBase<TensorMap<PlainObjectType> >
 {
   public:
     typedef TensorMap<PlainObjectType> Self;
     typedef typename PlainObjectType::Base Base;
     typedef typename Eigen::internal::nested<Self>::type Nested;
     typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
     typedef typename internal::traits<PlainObjectType>::Index Index;
     typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    typedef typename internal::packet_traits<Scalar>::type Packet;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef typename Base::CoeffReturnType CoeffReturnType;
@@ -43,13 +43,12 @@
     inline TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array<DenseIndex, 1>({{firstDimension}})) {
@@ -65,7 +64,7 @@
 #endif
 
-    inline TensorMap(PointerArgType dataPtr, const array<DenseIndex, PlainObjectType::NumIndices>& dimensions)
+    inline TensorMap(PointerArgType dataPtr, const array<Index, PlainObjectType::NumIndices>& dimensions)
       : m_data(dataPtr), m_dimensions(dimensions)
     { }
 
@@ -80,12 +79,97 @@
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, PlainObjectType::NumIndices>& indices) const
+    {
+      //  eigen_assert(checkIndexRange(indices));
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = m_dimensions.IndexOfRowMajor(indices);
+        return m_data[index];
+      } else {
+        const Index index = m_dimensions.IndexOfColMajor(indices);
+        return m_data[index];
+      }
+    }
+
+#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
+    template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
+    {
+      static_assert(sizeof...(otherIndices) + 1 == PlainObjectType::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = m_dimensions.IndexOfRowMajor(array<Index, PlainObjectType::NumIndices>{{firstIndex, otherIndices...}});
+        return m_data[index];
+      } else {
+        const Index index = m_dimensions.IndexOfColMajor(array<Index, PlainObjectType::NumIndices>{{firstIndex, otherIndices...}});
+        return m_data[index];
+      }
+    }
+#else
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
     {
       eigen_internal_assert(index >= 0 && index < size());
       return m_data[index];
     }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i1 + i0 * m_dimensions[1];
+        return m_data[index];
+      } else {
+        const Index index = i0 + i1 * m_dimensions[0];
+        return m_data[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
+        return m_data[index];
+      } else {
+        const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
+        return m_data[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
+        return m_data[index];
+      } else {
+        const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
+        return m_data[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
+        return m_data[index];
+      } else {
+        const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
+        return m_data[index];
+      }
+    }
+#endif
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, PlainObjectType::NumIndices>& indices)
+    {
+      //  eigen_assert(checkIndexRange(indices));
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = m_dimensions.IndexOfRowMajor(indices);
+        return m_data[index];
+      } else {
+        const Index index = m_dimensions.IndexOfColMajor(indices);
+        return m_data[index];
+      }
+    }
 
 #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
@@ -100,8 +184,60 @@
       return m_data[index];
     }
-#endif
+#else
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& operator()(Index index)
     {
       eigen_internal_assert(index >= 0 && index < size());
       return m_data[index];
     }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i1 + i0 * m_dimensions[1];
+        return m_data[index];
+      } else {
+        const Index index = i0 + i1 * m_dimensions[0];
+        return m_data[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
+        return m_data[index];
+      } else {
+        const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
+        return m_data[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
+        return m_data[index];
+      } else {
+        const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
+        return m_data[index];
+      }
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+    {
+      if (PlainObjectType::Options&RowMajor) {
+        const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
+        return m_data[index];
+      } else {
+        const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
+        return m_data[index];
+      }
+    }
+#endif
+
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     Self& operator=(const OtherDerived& other)
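A small usage example for the accessors added above (buffer contents are made
up; assumes the module's array alias, i.e. std::array on a C++11 compiler):

    float data[6] = {0, 1, 2, 3, 4, 5};
    Eigen::array<Eigen::DenseIndex, 2> dims;
    dims[0] = 2; dims[1] = 3;

    Eigen::TensorMap<Eigen::Tensor<float, 2> > map(data, dims);
    float x = map(1, 2);   // column-major by default: data[1 + 2 * 2] = data[5]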
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index efcb39559..64098343e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -72,9 +72,6 @@ class TensorStorage
     TensorStorage() { }
     TensorStorage(const TensorStorage& other) : Base_(other) { }
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    //  TensorStorage(TensorStorage&&) = default;
-#endif
     TensorStorage(internal::constructor_without_unaligned_array_assert) : Base_(internal::constructor_without_unaligned_array_assert()) {}
     TensorStorage(DenseIndex size, const array<DenseIndex, NumIndices_>& dimensions) : Base_(size, dimensions) {}
 
@@ -111,22 +108,6 @@ class TensorStorage
     ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
     void swap(Self_& other) { std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); }
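The TensorDevice / TensorAssign* machinery in this patch is ordinary dispatch
through partial specialization; the same pattern in miniature, stripped of
everything tensor-specific (a self-contained sketch, all names hypothetical):

    #include <cstdio>

    struct DefaultDevice {};
    struct ThreadPoolDevice { int cores; };

    template <typename Expr, typename Device> struct Assign {         // default strategy
      static void run(Expr&) { std::printf("single threaded\n"); }
    };
    template <typename Expr> struct Assign<Expr, ThreadPoolDevice> {  // multicore strategy
      static void run(Expr&) { std::printf("multi threaded\n"); }
    };

    template <typename Expr, typename Device> struct DeviceProxy {    // what device() returns
      Expr& expr;
      template <typename Other> DeviceProxy& operator=(const Other&) {
        Assign<Expr, Device>::run(expr);                              // resolved at compile time
        return *this;
      }
    };

Because the strategy is encoded in a type, choosing the evaluation backend
costs nothing at runtime.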