From 99d75235a9567865d2c070a2840d54c8a5ad0f43 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 13 Oct 2014 17:02:09 -0700
Subject: Misc improvements and cleanups

---
 unsupported/Eigen/CXX11/Tensor                      |   4 +
 .../Eigen/CXX11/src/Core/util/CXX11Workarounds.h    |   5 +
 .../Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h    | 101 ++++++++++++++++++++-
 unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h   |   4 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorBase.h     |   8 +-
 .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h     |   8 +-
 .../Eigen/CXX11/src/Tensor/TensorConvolution.h      |  12 ++-
 unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h   |  35 +++++++
 .../Eigen/CXX11/src/Tensor/TensorDeviceType.h       |  73 ++++++++-------
 .../Eigen/CXX11/src/Tensor/TensorDimensions.h       |   2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h   |   2 +-
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h        |  20 ++--
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h         |  36 ++++++--
 .../Eigen/CXX11/src/Tensor/TensorFixedSize.h        |   4 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h   |  26 +++---
 unsupported/Eigen/CXX11/src/Tensor/TensorMap.h      |  22 ++---
 unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h  |   4 +-
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h        |   4 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h  |   9 +-
 .../Eigen/CXX11/src/Tensor/TensorStriding.h         |  61 +++++++++----
 unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h   |  32 ++++---
 21 files changed, 342 insertions(+), 130 deletions(-)

diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 0dac95e45..2137f4276 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -30,6 +30,10 @@
 #include
 #include
 
+#ifdef EIGEN_USE_THREADS
+#include <future>
+#endif
+
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 #include <cuda_runtime.h>
 #endif

diff --git a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
index 227522ecb..e30eb6ad8 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/CXX11Workarounds.h
@@ -66,6 +66,11 @@ template<std::size_t I, class T, std::size_t N> constexpr inline T& array_
 template<std::size_t I, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; }
 template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
 
+template<std::size_t I, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; }
+
+
 #undef STD_GET_ARR_HACK
 
 template <typename T> struct array_size;
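For readers trying the threaded path: after this change, EIGEN_USE_THREADS only needs to be defined before including the top-level Tensor header, which now pulls in <future> itself. A minimal, illustrative sketch (thread count and tensor sizes are arbitrary, and the unsupported headers are assumed to be on the include path):

    #define EIGEN_USE_THREADS  // must precede the include so <future> is available
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::ThreadPoolDevice device(/*num_cores=*/4);
      Eigen::Tensor<float, 2> a(70, 30), b(70, 30), c(70, 30);
      a.setRandom();
      b.setRandom();
      c.device(device) = a + b;  // evaluated on std::async workers
      return 0;
    }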
diff --git a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
index 4c6b95773..e45d0a3b1 100644
--- a/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/Core/util/EmulateCXX11Meta.h
@@ -48,7 +48,8 @@ template <typename T, size_t n> class array {
     values[2] = v3;
   }
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4) {
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
+                            const T& v4) {
     EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
     values[0] = v1;
     values[1] = v2;
@@ -56,7 +57,8 @@ template <typename T, size_t n> class array {
     values[3] = v4;
   }
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5) {
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5) {
     EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
     values[0] = v1;
     values[1] = v2;
@@ -64,6 +66,43 @@ template <typename T, size_t n> class array {
     values[3] = v4;
     values[4] = v5;
   }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5, const T& v6) {
+    EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+                            const T& v5, const T& v6, const T& v7) {
+    EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE array(
+      const T& v1, const T& v2, const T& v3, const T& v4,
+      const T& v5, const T& v6, const T& v7, const T& v8) {
+    EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+    values[7] = v8;
+  }
 
 #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
   array(std::initializer_list<T> l) {
@@ -93,9 +132,11 @@ template struct type_list {
 struct null_type { };
 
-template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type, typename T4 = null_type, typename T5 = null_type>
+template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type,
+         typename T4 = null_type, typename T5 = null_type, typename T6 = null_type,
+         typename T7 = null_type, typename T8 = null_type>
 struct make_type_list {
-  typedef typename make_type_list<T2, T3, T4, T5>::type tailresult;
+  typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult;
 
   typedef type_list<T1, tailresult> type;
 };
@@ -150,6 +191,23 @@ template <typename T, T V> struct gen_numeric_list_repeated {
   typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
 };
 
+template <typename T, T V> struct gen_numeric_list_repeated<T, V, 6> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template <typename T, T V> struct gen_numeric_list_repeated<T, V, 7> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V> >::type type;
+};
+
+template <typename T, T V> struct gen_numeric_list_repeated<T, V, 8> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V> >::type type;
+};
+
 template <std::size_t index, class NList> struct get;
 
@@ -174,6 +232,7 @@ template <typename NList> struct arg_prod {
 template <> struct arg_prod<empty_list> {
   static const int value = 1;
 };
+
 template<int n, typename t>
 array<t,n> repeat(t v) {
   array<t,n> array;
@@ -190,6 +249,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_l
   return get<Index, type_list<Head, Tail> >::value;
 }
 
+template <typename NList>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) {
+  return arg_prod<NList>::value;
+};
+
 template<std::size_t n, typename t>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) {
   t prod = 1;
@@ -201,6 +265,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) {
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, 0>& /*a*/) {
   return 0;
 }
 
+template <typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
+  eigen_assert(a.size() > 0);
+  t prod = 1;
+  for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
+  return prod;
+}
+
 template<std::size_t I, class T, std::size_t N>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
   return a[I];
@@ -210,12 +282,31 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
   return a[I];
 }
 
+template<std::size_t I, class T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) {
+  return a[I];
+}
+template<std::size_t I, class T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) {
+  return a[I];
+}
+
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<array<T,N> > {
+  static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<array<T,N>& > {
+  static const size_t value = N;
+};
 template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<const array<T,N> > {
   static const size_t value = N;
 };
-
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const array<T,N>& > {
+  static const size_t value = N;
+};
 
 struct sum_op {
   template<typename A, typename B> static inline bool run(A a, B b) { return a + b; }
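The point of the additions above is to let the C++03 emulation handle tensors of rank 6 through 8 and runtime-sized dimension lists. A hedged sketch of what they enable (the multi-argument array constructor only exists when the emulation is active, i.e. without variadic template support, and internal:: names are implementation details rather than public API):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>
    #include <vector>

    int main() {
      // Rank-6 dimension list, possible now that the emulated array
      // has a 6-argument constructor:
      Eigen::array<int, 6> dims6(2, 3, 4, 5, 6, 7);
      assert(Eigen::internal::array_prod(dims6) == 2*3*4*5*6*7);

      // std::vector overload: product of a runtime-sized dimension list.
      std::vector<int> dims(3, 4);  // {4, 4, 4}
      assert(Eigen::internal::array_prod(dims) == 64);
      return 0;
    }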
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 3bfe80c9e..e973c00d3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -131,8 +131,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
-    static const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
-    static const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
+    const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
+    const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
     m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
   }
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 27c10f64f..6018ecc66 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -30,6 +30,12 @@ class TensorBase
     typedef Scalar CoeffReturnType;
     typedef typename internal::packet_traits<Scalar>::type PacketReturnType;
 
+    // Dimensions
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return derived().dimensions()[n]; }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(derived().dimensions()); }
+
     // Nullary operators
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived>
@@ -187,7 +193,7 @@ class TensorBase
     }
 
     // Contractions.
-    typedef std::pair<Index, Index> DimensionPair;
+    typedef Eigen::IndexPair<Index> DimensionPair;
 
     template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorContractionOp<const Dimensions, const Derived, const OtherDerived>
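Since TensorBase now exposes dimension() and size(), and contraction dimensions use Eigen::IndexPair (which, unlike std::pair, works inside CUDA kernels), a typical call site looks roughly like this (an illustrative sketch; values arbitrary):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    int main() {
      Eigen::Tensor<float, 3> t(3, 5, 7);
      assert(t.dimension(1) == 5);
      assert(t.size() == 3 * 5 * 7);

      // Contract dim 1 of m1 with dim 0 of m2 (a matrix product).
      Eigen::Tensor<float, 2> m1(4, 3), m2(3, 6);
      m1.setRandom();
      m2.setRandom();
      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dims;
      dims[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
      Eigen::Tensor<float, 2> prod = m1.contract(m2, dims);
      assert(prod.dimension(0) == 4 && prod.dimension(1) == 6);
      return 0;
    }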
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 3b2a9c8b9..0e55d4de1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -48,7 +48,7 @@ struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
 
 template<typename Broadcast, typename XprType>
-class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, WriteAccessors>
+class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
@@ -91,7 +91,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
   };
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : m_impl(op.expression(), device)
   {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -141,7 +141,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
 
@@ -161,7 +161,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     if (innermostLoc + packetSize <= m_impl.dimensions()[0]) {
       return m_impl.template packet<Unaligned>(inputIndex);
     } else {
-      EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
       values[0] = m_impl.coeff(inputIndex);
       for (int i = 1; i < packetSize; ++i) {
         values[i] = coeff(originalIndex+i);

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 4a5fd9c79..34bdd5309 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -872,11 +872,19 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
     eigen_assert(m_buf);
     eigen_assert(index < m_dimensions.TotalSize());
     return m_buf[index];
   }
 
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
+  {
+    eigen_assert(m_buf);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
+  }
+
   private:
   // No assignment (copies are needed by the kernels)
   TensorEvaluator& operator = (const TensorEvaluator&);
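TensorBroadcastingOp is now declared read-only (ReadOnlyAccessors), which matches how it is actually used: broadcast() only ever appears on the right-hand side of an assignment. A short usage sketch (sizes arbitrary):

    Eigen::Tensor<float, 2> t(2, 3);
    t.setRandom();
    Eigen::array<int, 2> bcast;
    bcast[0] = 3;  // replicate 3x along dimension 0
    bcast[1] = 2;  // replicate 2x along dimension 1
    Eigen::Tensor<float, 2> big = t.broadcast(bcast);  // 6 x 6 result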
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 75519c9f5..649bdb308 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -38,6 +38,18 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+      Sum sum(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+      Assign assign(m_expression, sum);
+      static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
+      internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
+      return *this;
+    }
+
   protected:
     const DeviceType& m_device;
     ExpressionType& m_expression;
@@ -58,6 +70,18 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice> {
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+      Sum sum(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+      Assign assign(m_expression, sum);
+      static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
+      internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
+      return *this;
+    }
+
   protected:
     const ThreadPoolDevice& m_device;
     ExpressionType& m_expression;
@@ -79,6 +103,17 @@ template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice> {
       return *this;
     }
 
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+      typedef typename OtherDerived::Scalar Scalar;
+      typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+      Sum sum(m_expression, other);
+      typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+      Assign assign(m_expression, sum);
+      internal::TensorExecutor<const Assign, GpuDevice>::run(assign, m_device);
+      return *this;
+    }
+
   protected:
     const GpuDevice& m_device;
     ExpressionType m_expression;
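The new operator+= mirrors the existing operator= on TensorDevice: it builds a cwise sum of the destination and the right-hand side, wraps the result in a TensorAssignOp, and hands that to TensorExecutor for the target device. A minimal sketch, assuming EIGEN_USE_THREADS (thread count and sizes arbitrary):

    Eigen::ThreadPoolDevice device(8);
    Eigen::Tensor<float, 1> acc(1000), delta(1000);
    acc.setConstant(0.0f);
    delta.setRandom();
    acc.device(device) += delta;  // acc = acc + delta, run on the thread pool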
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
index fad342eab..5a6ff70e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h
@@ -37,23 +37,41 @@ struct DefaultDevice {
 // Multiple cpu cores
 // We should really use a thread pool here but first we need to find a portable thread pool library.
 #ifdef EIGEN_USE_THREADS
+
+typedef std::future<void> Future;
+
 struct ThreadPoolDevice {
-  ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : /*pool_(pool), */num_threads_(num_cores) { }
-  size_t numThreads() const { return num_threads_; }
+  ThreadPoolDevice(/*ThreadPool* pool, */size_t num_cores) : num_threads_(num_cores) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);
   }
+
   EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     internal::aligned_free(buffer);
   }
+
   EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
     ::memcpy(dst, src, n);
   }
+
   EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
     ::memset(buffer, c, n);
   }
 
+  EIGEN_STRONG_INLINE size_t numThreads() const {
+    return num_threads_;
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const {
+    return std::async(std::launch::async, f, args...);
+  }
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... args) const {
+    std::async(std::launch::async, f, args...);
+  }
+
  private:
   // todo: NUMA, ...
   size_t num_threads_;
@@ -63,41 +81,34 @@ struct ThreadPoolDevice {
 
 // GPU offloading
 #ifdef EIGEN_USE_GPU
-static int m_numMultiProcessors = 0;
-static int m_maxThreadsPerBlock = 0;
-static int m_maxThreadsPerMultiProcessor = 0;
+static cudaDeviceProp m_deviceProperties;
+static bool m_devicePropInitialized = false;
+
+static void initializeDeviceProp() {
+  if (!m_devicePropInitialized) {
+    assert(cudaGetDeviceProperties(&m_deviceProperties, 0) == cudaSuccess);
+    m_devicePropInitialized = true;
+  }
+}
 
 static inline int getNumCudaMultiProcessors() {
-  if (m_numMultiProcessors == 0) {
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, 0);
-    m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
-    m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
-    m_numMultiProcessors = deviceProp.multiProcessorCount;
-  }
-  return m_numMultiProcessors;
+  initializeDeviceProp();
+  return m_deviceProperties.multiProcessorCount;
 }
 static inline int maxCudaThreadsPerBlock() {
-  if (m_maxThreadsPerBlock == 0) {
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, 0);
-    m_numMultiProcessors = deviceProp.multiProcessorCount;
-    m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
-    m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
-  }
-  return m_maxThreadsPerBlock;
+  initializeDeviceProp();
+  return m_deviceProperties.maxThreadsPerBlock;
 }
 static inline int maxCudaThreadsPerMultiProcessor() {
-  if (m_maxThreadsPerBlock == 0) {
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, 0);
-    m_numMultiProcessors = deviceProp.multiProcessorCount;
-    m_maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
-    m_maxThreadsPerMultiProcessor = deviceProp.maxThreadsPerMultiProcessor;
-  }
-  return m_maxThreadsPerMultiProcessor;
+  initializeDeviceProp();
+  return m_deviceProperties.maxThreadsPerMultiProcessor;
+}
+static inline int sharedMemPerBlock() {
+  initializeDeviceProp();
+  return m_deviceProperties.sharedMemPerBlock;
 }
 
+
 struct GpuDevice {
   // The cudastream is not owned: the caller is responsible for its initialization and eventual destruction.
   GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); }
@@ -141,8 +152,8 @@ struct GpuDevice {
 #endif
   }
 
-  EIGEN_STRONG_INLINE size_t numThreads() const {
-    // Fixme:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+    // FIXME
     return 32;
   }
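ThreadPoolDevice::enqueue is, for now, a thin wrapper over std::async (the comment above still asks for a portable thread pool). A minimal sketch; `work` is a hypothetical free function. Note that enqueueNoFuture still blocks in practice, because the temporary std::future returned by std::async waits in its destructor:

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    static void work(int id) { std::cout << "task " << id << "\n"; }

    int main() {
      Eigen::ThreadPoolDevice device(2);
      Eigen::Future f = device.enqueue(&work, 1);  // Future is std::future<void>
      device.enqueueNoFuture(&work, 2);
      f.wait();
      return 0;
    }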
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 732c6b344..2dd8e274b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -29,7 +29,7 @@ namespace Eigen {
   * \sa Tensor
   */
 
-// Can't use std::pairs on cuda devices
+// Can't use std::pair on cuda devices
 template <typename Index> struct IndexPair {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { }
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 587cbd5ca..ce9d73578 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -116,7 +116,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
     m_buffer[i] = m_impl.coeff(i);
   }
-  EIGEN_STRONG_INLINE void evalPacket(Index i) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
     internal::pstoret<Scalar, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
   }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 0f969036c..e324ba8d2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -65,13 +65,13 @@ struct TensorEvaluator
     return m_data[index];
   }
 
-  template<int LoadMode> EIGEN_STRONG_INLINE
+  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketReturnType packet(Index index) const
   {
     return internal::ploadt<Packet, LoadMode>(m_data + index);
   }
 
-  template <int StoreMode> EIGEN_STRONG_INLINE
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const Packet& x)
   {
     return internal::pstoret<Scalar, Packet, StoreMode>(m_data + index, x);
@@ -113,13 +113,17 @@ struct TensorEvaluator<const Derived, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     eigen_assert(m_data);
+#ifdef __CUDA_ARCH__
+    return __ldg(m_data+index);
+#else
     return m_data[index];
+#endif
   }
 
-  template<int LoadMode> EIGEN_STRONG_INLINE
+  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketReturnType packet(Index index) const
   {
-    return internal::ploadt<Packet, LoadMode>(m_data + index);
+    return internal::ploadt_ro<Packet, LoadMode>(m_data + index);
   }
 
   const Scalar* data() const { return m_data; }
@@ -166,7 +170,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
   }
 
   template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     return m_functor.packetOp(index);
   }
@@ -219,7 +223,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
   }
 
   template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
   }
@@ -278,7 +282,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
   template<int LoadMode>
-  EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
   }
@@ -340,7 +344,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
     return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
   }
   template<int LoadMode>
-  PacketReturnType packet(Index index) const
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
   {
     static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
     internal::Selector<PacketSize> select;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 10f5a5ee7..01fa04c64 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -10,10 +10,6 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
 
-#ifdef EIGEN_USE_THREADS
-#include <future>
-#endif
-
 namespace Eigen {
 
 /** \class TensorExecutor
@@ -62,7 +58,7 @@ class TensorExecutor<Expression, DefaultDevice, true>
       const Index size = array_prod(evaluator.dimensions());
       static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
-      const int VectorizedSize = (size / PacketSize) * PacketSize;
+      const Index VectorizedSize = (size / PacketSize) * PacketSize;
 
       for (Index i = 0; i < VectorizedSize; i += PacketSize) {
         evaluator.evalPacket(i);
@@ -131,10 +127,10 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
       const Index numblocks = size / blocksize;
 
       Index i = 0;
-      std::vector<Future> results;
+      std::vector<Future> results;
       results.reserve(numblocks);
       for (int i = 0; i < numblocks; ++i) {
-        results.push_back(std::async(std::launch::async, &EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
+        results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
       }
 
       for (int i = 0; i < numblocks; ++i) {
@@ -154,11 +150,31 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 template <typename Evaluator>
-__global__ void EigenMetaKernel(Evaluator eval, unsigned int size) {
+__global__ void
+__launch_bounds__(1024)
+EigenMetaKernel(Evaluator eval, unsigned int size) {
+
   const int first_index = blockIdx.x * blockDim.x + threadIdx.x;
   const int step_size = blockDim.x * gridDim.x;
-  for (int i = first_index; i < size; i += step_size) {
-    eval.evalScalar(i);
+
+  if (!Evaluator::PacketAccess || !Evaluator::IsAligned) {
+    // Use the scalar path
+    for (int i = first_index; i < size; i += step_size) {
+      eval.evalScalar(i);
+    }
+  }
+  else {
+    // Use the vector path
+    const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const int vectorized_step_size = step_size * PacketSize;
+    const int vectorized_size = (size / PacketSize) * PacketSize;
+    int i = first_index * PacketSize;
+    for ( ; i < vectorized_size; i += vectorized_step_size) {
+      eval.evalPacket(i);
+    }
+    for ( ; i < size; i += step_size) {
+      eval.evalScalar(i);
+    }
   }
 }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index 4d7f9e1fd..a753c5a48 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -17,7 +17,7 @@ namespace Eigen {
   *
   * \brief The fixed sized version of the tensor class.
   *
- * The fixes sized equivalent of
+ * The fixed sized equivalent of
   * Eigen::Tensor<float, 3> t(3, 5, 7);
   * is
   * Eigen::TensorFixedSize<float, Size<3,5,7>> t;
@@ -41,7 +41,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> >
     enum {
       IsAligned = bool(EIGEN_ALIGN),
-      PacketAccess = true,
+      PacketAccess = (internal::packet_traits<Scalar>::size > 1),
     };
 
     typedef Dimensions_ Dimensions;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index cf97031be..2714117ab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -31,30 +31,34 @@ namespace internal {
 
 template <typename T>
 struct TensorIntDivisor {
  public:
-  TensorIntDivisor() {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
     multiplier = 0;
     shift1 = 0;
     shift2 = 0;
   }
 
   // Must have 1 <= divider <= 2^31-1
-  TensorIntDivisor(const T divider) {
-    static const int N = 32;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) {
+    const int N = 32;
     eigen_assert(divider > 0);
     eigen_assert(divider <= (1<<(N-1)) - 1);
 
     // fast ln2
+#ifndef __CUDA_ARCH__
     const int leading_zeros = __builtin_clz(divider);
-    const int l = N - (leading_zeros+1);
-
-    multiplier = (static_cast<uint64_t>(1) << (N+l)) / divider - (static_cast<uint64_t>(1) << N) + 1;
-    shift1 = (std::min)(1, l);
-    shift2 = (std::max)(0, l-1);
+#else
+    const int leading_zeros = __clz(divider);
+#endif
+    const int log_div = N - (leading_zeros+1);
+
+    multiplier = (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1;
+    shift1 = log_div > 1 ? 1 : log_div;
+    shift2 = log_div > 1 ? log_div-1 : 0;
   }
 
   // Must have 0 <= numerator <= 2^32-1
-  T divide(const T numerator) const {
-    static const int N = 32;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
+    const int N = 32;
     eigen_assert(numerator >= 0);
     eigen_assert(numerator <= (1ull<<N) - 1);
 
 template <typename T>
-static T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
   return divisor.divide(numerator);
 }
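TensorIntDivisor implements the classic multiply-and-shift trick for fast integer division by a runtime constant; the changes above make it usable inside CUDA kernels (__clz in place of __builtin_clz, EIGEN_DEVICE_FUNC annotations, and no more function-local statics). A usage sketch (internal:: API, shown for illustration only):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    int main() {
      const int divider = 7;
      Eigen::internal::TensorIntDivisor<int> fast_div(divider);
      for (int n = 0; n < 10000; ++n) {
        assert(n / fast_div == n / divider);  // operator/ forwards to divide()
      }
      return 0;
    }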
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index 04849dd9f..2c0d2cd0f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -42,26 +42,25 @@ template <typename PlainObjectType, int Options_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_> >
 
     static const int Options = Options_;
 
-    static const std::size_t NumIndices = PlainObjectType::NumIndices;
+    static const Index NumIndices = PlainObjectType::NumIndices;
     typedef typename PlainObjectType::Dimensions Dimensions;
-
     enum {
-      IsAligned = bool(EIGEN_ALIGN) && ((int(Options_)&Aligned)==Aligned),
-      PacketAccess = true,
+      IsAligned = ((int(Options_)&Aligned)==Aligned),
+      PacketAccess = (internal::packet_traits<Scalar>::size > 1),
     };
 
 #ifdef EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(array<Index, NumIndices>({{firstDimension, otherDimensions...}})) {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
-      EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(array<Index, NumIndices>(firstDimension)) {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
-      EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #endif
 
@@ -176,12 +175,13 @@ template <typename PlainObjectType, int Options_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_> >
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
     {
-      static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+      static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+      const std::size_t NumDims = sizeof...(otherIndices) + 1;
       if (PlainObjectType::Options&RowMajor) {
-        const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+        const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, otherIndices...}});
         return m_data[index];
       } else {
-        const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+        const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, otherIndices...}});
         return m_data[index];
       }
     }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 7da89458f..8da6e0f26 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -144,7 +144,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
 
@@ -206,7 +206,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
-    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
     for (int i = 0; i < packetSize; ++i) {
       values[i] = coeff(index+i);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index f7e7fc107..7e0063626 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -97,7 +97,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
 
   enum {
-    IsAligned = true,
+    IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
   };
 
@@ -194,7 +194,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
 
   enum {
-    IsAligned = true,
+    IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
   };

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 0c4f8a3d6..aaec39756 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -30,11 +30,11 @@ namespace Eigen {
   *
   * \sa Tensor
   */
-template class TensorStorage;
+template class TensorStorage;
 
 
 // Pure fixed-size storage
-template
+template
 class TensorStorage
 {
  private:
@@ -62,7 +62,7 @@ class TensorStorage
 
 // pure-dynamic, but without specification of all dimensions explicitly
-template
+template
 class TensorStorage<T, NumIndices_, Dynamic, Options_>
   : public TensorStorage<T, NumIndices_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type, Options_>
 {
@@ -79,7 +79,7 @@ class TensorStorage
 };
 
 // pure dynamic
-template
+template
 class TensorStorage<T, NumIndices_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type, Options_>
 {
   T *m_data;
@@ -140,6 +140,7 @@ class TensorStorage
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ ... @@ struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type>
 
 template<typename Strides, typename XprType>
-class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType>, WriteAccessors>
+class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
 {
   public:
   typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
@@ -97,7 +97,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
-    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -109,28 +109,23 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     }
 
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
-    for (int i = 0; i < NumDims; ++i) {
-      if (i > 0) {
-        m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
-        m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
-      } else {
-        m_inputStrides[0] = 1;
-        m_outputStrides[0] = 1;
-      }
-    }
-    for (int i = 0; i < NumDims; ++i) {
-      m_inputStrides[i] *= op.strides()[i];
+    m_outputStrides[0] = 1;
+    m_inputStrides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+      m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+      m_inputStrides[i-1] *= op.strides()[i-1];
     }
+    m_inputStrides[NumDims-1] *= op.strides()[NumDims-1];
   }
 
-  // typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketReturnType PacketReturnType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -150,16 +145,44 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     return m_impl.coeff(inputIndex);
   }
 
-  /* template<int LoadMode>
+  template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    return m_impl.template packet<LoadMode>(index);
-  }*/
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+    Index inputIndices[] = {0, 0};
+    Index indices[] = {index, index + packetSize - 1};
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx0 = indices[0] / m_outputStrides[i];
+      const Index idx1 = indices[1] / m_outputStrides[i];
+      inputIndices[0] += idx0 * m_inputStrides[i];
+      inputIndices[1] += idx1 * m_inputStrides[i];
+      indices[0] -= idx0 * m_outputStrides[i];
+      indices[1] -= idx1 * m_outputStrides[i];
+    }
+    inputIndices[0] += indices[0] * m_inputStrides[0];
+    inputIndices[1] += indices[1] * m_inputStrides[0];
+    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+      PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+      return rslt;
+    }
+    else {
+      EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+      values[0] = m_impl.coeff(inputIndices[0]);
+      values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+      for (int i = 1; i < packetSize-1; ++i) {
+        values[i] = coeff(index+i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
 
   Scalar* data() const { return NULL; }
 
  protected:
-  // Strides m_strides;
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
   array<Index, NumDims> m_inputStrides;

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 40f805741..5940a8cf1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -70,14 +70,18 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_> >
 };
 
 
-template<typename PlainObjectType>
-struct traits<TensorMap<PlainObjectType> >
+template<typename PlainObjectType, int Options_>
+struct traits<TensorMap<PlainObjectType, Options_> >
   : public traits<PlainObjectType>
 {
   typedef traits<PlainObjectType> BaseTraits;
   typedef typename BaseTraits::Scalar Scalar;
   typedef typename BaseTraits::StorageKind StorageKind;
   typedef typename BaseTraits::Index Index;
+  enum {
+    Options = Options_,
+    Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
+  };
 };
 
 
@@ -105,16 +109,16 @@ struct eval<TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense>
   typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
 };
 
-template<typename PlainObjectType>
-struct eval<TensorMap<PlainObjectType>, Eigen::Dense>
+template<typename PlainObjectType, int Options>
+struct eval<TensorMap<PlainObjectType, Options>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType>& type;
+  typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
-template<typename PlainObjectType>
-struct eval<const TensorMap<PlainObjectType>, Eigen::Dense>
+template<typename PlainObjectType, int Options>
+struct eval<const TensorMap<PlainObjectType, Options>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType>& type;
+  typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
 template
@@ -141,16 +145,16 @@ struct nested<TensorFixedSize<Scalar_, Dimensions, Options>, 1, typename eval<TensorFixedSize<Scalar_, Dimensions, Options> >::type>
   typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
 };
 
-template <typename PlainObjectType>
-struct nested<TensorMap<PlainObjectType>, 1, typename eval<TensorMap<PlainObjectType> >::type>
+template <typename PlainObjectType, int Options>
+struct nested<TensorMap<PlainObjectType, Options>, 1, typename eval<TensorMap<PlainObjectType, Options> >::type>
 {
-  typedef const TensorMap<PlainObjectType>& type;
+  typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
-template <typename PlainObjectType>
-struct nested<const TensorMap<PlainObjectType>, 1, typename eval<const TensorMap<PlainObjectType> >::type>
+template <typename PlainObjectType, int Options>
+struct nested<const TensorMap<PlainObjectType, Options>, 1, typename eval<const TensorMap<PlainObjectType, Options> >::type>
 {
-  typedef const TensorMap<PlainObjectType>& type;
+  typedef const TensorMap<PlainObjectType, Options>& type;
 };
 
 }  // end namespace internal
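With the packet() path added to TensorStriding above, strided reads are vectorized whenever the two gathered endpoints turn out to be contiguous in the input, and fall back to per-coefficient loads otherwise. A short usage sketch of the expression it accelerates (sizes and strides arbitrary):

    Eigen::Tensor<float, 2> t(6, 8);
    t.setRandom();
    Eigen::array<std::ptrdiff_t, 2> strides;
    strides[0] = 2;
    strides[1] = 2;
    Eigen::Tensor<float, 2> sub = t.stride(strides);  // 3 x 4: every other coefficient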