From 6913221c43c6ad41b1fbfc0d263d2764abd11ad2 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 25 Jul 2018 13:51:10 -0700
Subject: Add tiled evaluation support to TensorExecutor

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h')
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 278689915..7ff0d323b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -1,4 +1,5 @@
 // This file is part of Eigen, a lightweight C++ template library
+// This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
@@ -110,6 +111,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   enum {
     IsAligned = true,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
   };
-- 
cgit v1.2.3


From 966c2a7bb62a8b5b9ecd349730ffcd3b5719837d Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Fri, 27 Jul 2018 12:45:17 -0700
Subject: Rename Index to StorageIndex + use Eigen::Array and Eigen::Map when
 possible

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h   | 418 ++++++++-------------
 .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h    |   1 -
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h       |   9 +-
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h        | 185 ++++-----
 unsupported/test/cxx11_tensor_block_access.cpp     | 292 ++++++++------
 unsupported/test/cxx11_tensor_executor.cpp         |  20 +-
 6 files changed, 447 insertions(+), 478 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index 5321acecf..84cf6d216 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -67,21 +67,21 @@ enum class TensorBlockShapeType {
 
 struct TensorOpResourceRequirements {
   TensorBlockShapeType block_shape;
-  std::size_t block_total_size;
+  Index block_total_size;
   // TODO(andydavis) Add 'target_num_threads' to support communication of
   // thread-resource requirements. This will allow ops deep in the
   // expression tree (like reductions) to communicate resources
   // requirements based on local state (like the total number of reductions
   // to be computed).
   TensorOpResourceRequirements(internal::TensorBlockShapeType shape,
-                               const std::size_t size)
+                               const Index size)
       : block_shape(shape), block_total_size(size) {}
 };
 
 // Tries to merge multiple resource requirements.
 EIGEN_STRONG_INLINE void MergeResourceRequirements(
     const std::vector<TensorOpResourceRequirements>& resources,
-    TensorBlockShapeType* block_shape, std::size_t* block_total_size) {
+    TensorBlockShapeType* block_shape, Index* block_total_size) {
   if (resources.empty()) {
     return;
   }
@@ -108,12 +108,12 @@ EIGEN_STRONG_INLINE void MergeResourceRequirements(
  * This class represents a tensor block specified by the index of the
  * first block coefficient, and the size of the block in each dimension.
  */
-template <typename Scalar, typename Index, int NumDims, int Layout>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
 class TensorBlock {
  public:
-  typedef DSizes<Index, NumDims> Dimensions;
+  typedef DSizes<StorageIndex, NumDims> Dimensions;
 
-  TensorBlock(const Index first_coeff_index, const Dimensions& block_sizes,
+  TensorBlock(const StorageIndex first_coeff_index, const Dimensions& block_sizes,
               const Dimensions& block_strides, const Dimensions& tensor_strides,
               Scalar* data)
       : m_first_coeff_index(first_coeff_index),
@@ -122,7 +122,7 @@ class TensorBlock {
         m_tensor_strides(tensor_strides),
         m_data(data) {}
 
-  Index first_coeff_index() const { return m_first_coeff_index; }
+  StorageIndex first_coeff_index() const { return m_first_coeff_index; }
 
   const Dimensions& block_sizes() const { return m_block_sizes; }
 
@@ -135,108 +135,33 @@ class TensorBlock {
   const Scalar* data() const { return m_data; }
 
  private:
-  Index m_first_coeff_index;
+  StorageIndex m_first_coeff_index;
   Dimensions m_block_sizes;
   Dimensions m_block_strides;
   Dimensions m_tensor_strides;
   Scalar* m_data;  // Not owned.
 };
 
-template <typename Scalar, typename Index, bool Vectorizable>
+template <typename Scalar, typename StorageIndex>
 struct TensorBlockCopyOp {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Index num_coeff_to_copy, const Index dst_index,
-      const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data,
-      const Index src_index, const Index src_stride,
+      const StorageIndex num_coeff_to_copy, const StorageIndex dst_index,
+      const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data,
+      const StorageIndex src_index, const StorageIndex src_stride,
       const Scalar* EIGEN_RESTRICT src_data) {
-    for (Index i = 0; i < num_coeff_to_copy; ++i) {
-      dst_data[dst_index + i * dst_stride] =
-          src_data[src_index + i * src_stride];
-    }
-  }
-};
+    const Scalar* src_base = &src_data[src_index];
+    Scalar* dst_base = &dst_data[dst_index];
 
-// NOTE: Benchmarks run on an implementation of this that broke each of the
-// loops in these conditionals into it's own template specialization (to
-// avoid conditionals in the caller's loop) did not show an improvement.
-template <typename Scalar, typename Index>
-struct TensorBlockCopyOp<Scalar, Index, true> {
-  typedef typename packet_traits<Scalar>::type Packet;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Index num_coeff_to_copy, const Index dst_index,
-      const Index dst_stride, Scalar* EIGEN_RESTRICT dst_data,
-      const Index src_index, const Index src_stride,
-      const Scalar* EIGEN_RESTRICT src_data) {
-    if (src_stride == 1) {
-      const Index packet_size = internal::unpacket_traits<Packet>::size;
-      const Index vectorized_size =
-          (num_coeff_to_copy / packet_size) * packet_size;
-      if (dst_stride == 1) {
-        // LINEAR
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::ploadu<Packet>(src_data + src_index + i);
-          internal::pstoreu<Scalar, Packet>(dst_data + dst_index + i, p);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i] = src_data[src_index + i];
-        }
-      } else {
-        // SCATTER
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::ploadu<Packet>(src_data + src_index + i);
-          internal::pscatter<Scalar, Packet>(
-              dst_data + dst_index + i * dst_stride, p, dst_stride);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i * dst_stride] = src_data[src_index + i];
-        }
-      }
-    } else if (src_stride == 0) {
-      const Index packet_size = internal::unpacket_traits<Packet>::size;
-      const Index vectorized_size =
-          (num_coeff_to_copy / packet_size) * packet_size;
-      if (dst_stride == 1) {
-        // LINEAR
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::pload1<Packet>(src_data + src_index);
-          internal::pstoreu<Scalar, Packet>(dst_data + dst_index + i, p);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i] = src_data[src_index];
-        }
-      } else {
-        // SCATTER
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::pload1<Packet>(src_data + src_index);
-          internal::pscatter<Scalar, Packet>(
-              dst_data + dst_index + i * dst_stride, p, dst_stride);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i * dst_stride] = src_data[src_index];
-        }
-      }
-    } else {
-      if (dst_stride == 1) {
-        // GATHER
-        const Index packet_size = internal::unpacket_traits<Packet>::size;
-        const Index vectorized_size =
-            (num_coeff_to_copy / packet_size) * packet_size;
-        for (Index i = 0; i < vectorized_size; i += packet_size) {
-          Packet p = internal::pgather<Scalar, Packet>(
-              src_data + src_index + i * src_stride, src_stride);
-          internal::pstoreu<Scalar, Packet>(dst_data + dst_index + i, p);
-        }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i] = src_data[src_index + i * src_stride];
-        }
-      } else {
-        // RANDOM
-        for (Index i = 0; i < num_coeff_to_copy; ++i) {
-          dst_data[dst_index + i * dst_stride] =
-              src_data[src_index + i * src_stride];
-        }
-      }
-    }
+    using Src = const Eigen::Array<Scalar, Dynamic, 1>;
+    using Dst = Eigen::Array<Scalar, Dynamic, 1>;
+
+    using SrcMap = Eigen::Map<Src, 0, InnerStride<>>;
+    using DstMap = Eigen::Map<Dst, 0, InnerStride<>>;
+
+    const SrcMap src(src_base, num_coeff_to_copy, InnerStride<>(src_stride));
+    DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride));
+
+    dst = src;
   }
 };
 
@@ -249,34 +174,34 @@ struct TensorBlockCopyOp<Scalar, Index, true> {
  * This class is responsible for copying data between a tensor and a tensor
  * block.
  */
-template <typename Scalar, typename Index, int NumDims, int Layout,
-          bool Vectorizable, bool BlockRead>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout,
+          bool BlockRead>
 class TensorBlockIO {
  public:
-  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+  typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
       TensorBlock;
-  typedef typename internal::TensorBlockCopyOp<Scalar, Index, Vectorizable>
+  typedef typename internal::TensorBlockCopyOp<Scalar, StorageIndex>
       TensorBlockCopyOp;
 
  protected:
   struct BlockIteratorState {
-    Index input_stride;
-    Index output_stride;
-    Index input_span;
-    Index output_span;
-    Index size;
-    Index count;
+    StorageIndex input_stride;
+    StorageIndex output_stride;
+    StorageIndex input_span;
+    StorageIndex output_span;
+    StorageIndex size;
+    StorageIndex count;
   };
 
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Copy(
-      const TensorBlock& block, Index first_coeff_index,
-      const array<Index, NumDims>& tensor_to_block_dim_map,
-      const array<Index, NumDims>& tensor_strides, const Scalar* src_data,
+      const TensorBlock& block, StorageIndex first_coeff_index,
+      const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
+      const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data,
       Scalar* dst_data) {
     // Find the innermost tensor dimension whose size is not 1. This is the
     // effective inner dim. If all dimensions are of size 1, then fallback to
     // using the actual innermost dim to avoid out-of-bound access.
-    Index num_size_one_inner_dims = 0;
+    StorageIndex num_size_one_inner_dims = 0;
     for (int i = 0; i < NumDims; ++i) {
       const int dim = cond<Layout>()(i, NumDims - i - 1);
       if (block.block_sizes()[tensor_to_block_dim_map[dim]] != 1) {
@@ -285,16 +210,16 @@ class TensorBlockIO {
       }
     }
     // Calculate strides and dimensions.
-    const Index tensor_stride1_dim = cond<Layout>()(
+    const StorageIndex tensor_stride1_dim = cond<Layout>()(
         num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1);
-    const Index block_dim_for_tensor_stride1_dim =
+    const StorageIndex block_dim_for_tensor_stride1_dim =
         NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim];
     size_t block_inner_dim_size =
         NumDims == 0 ? 1
                      : block.block_sizes()[block_dim_for_tensor_stride1_dim];
     for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
       const int dim = cond<Layout>()(i, NumDims - i - 1);
-      const Index block_stride =
+      const StorageIndex block_stride =
           block.block_strides()[tensor_to_block_dim_map[dim]];
       if (block_inner_dim_size == block_stride &&
           block_stride == tensor_strides[dim]) {
@@ -306,10 +231,10 @@ class TensorBlockIO {
       }
     }
 
-    Index inputIndex;
-    Index outputIndex;
-    Index input_stride;
-    Index output_stride;
+    StorageIndex inputIndex;
+    StorageIndex outputIndex;
+    StorageIndex input_stride;
+    StorageIndex output_stride;
 
     // Setup strides to read/write along the tensor's stride1 dimension.
     if (BlockRead) {
@@ -337,7 +262,7 @@ class TensorBlockIO {
     int num_squeezed_dims = 0;
     for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
       const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
-      const Index size = block.block_sizes()[tensor_to_block_dim_map[dim]];
+      const StorageIndex size = block.block_sizes()[tensor_to_block_dim_map[dim]];
       if (size == 1) {
         continue;
       }
@@ -362,9 +287,9 @@ class TensorBlockIO {
     }
 
     // Iterate copying data from src to dst.
-    const Index block_total_size =
+    const StorageIndex block_total_size =
         NumDims == 0 ? 1 : block.block_sizes().TotalSize();
-    for (Index i = 0; i < block_total_size; i += block_inner_dim_size) {
+    for (StorageIndex i = 0; i < block_total_size; i += block_inner_dim_size) {
       TensorBlockCopyOp::Run(block_inner_dim_size, outputIndex, output_stride,
                              dst_data, inputIndex, input_stride, src_data);
       // Update index.
@@ -391,19 +316,18 @@ class TensorBlockIO {
  * This class is responsible for reading a tensor block.
  *
  */
-template <typename Scalar, typename Index, int NumDims, int Layout,
-          bool Vectorizable>
-class TensorBlockReader
-    : public TensorBlockIO<Scalar, Index, NumDims, Layout, Vectorizable, true> {
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
+class TensorBlockReader : public TensorBlockIO<Scalar, StorageIndex, NumDims,
+                                               Layout, /*BlockRead=*/true> {
  public:
-  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+  typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
       TensorBlock;
-  typedef TensorBlockIO<Scalar, Index, NumDims, Layout, Vectorizable, true>
+  typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/true>
       Base;
 
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
       TensorBlock* block, const Scalar* src_data) {
-    array<Index, NumDims> tensor_to_block_dim_map;
+    array<StorageIndex, NumDims> tensor_to_block_dim_map;
     for (int i = 0; i < NumDims; ++i) {
       tensor_to_block_dim_map[i] = i;
     }
@@ -412,9 +336,9 @@ class TensorBlockReader
   }
 
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      TensorBlock* block, Index first_coeff_index,
-      const array<Index, NumDims>& tensor_to_block_dim_map,
-      const array<Index, NumDims>& tensor_strides, const Scalar* src_data) {
+      TensorBlock* block, StorageIndex first_coeff_index,
+      const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
+      const array<StorageIndex, NumDims>& tensor_strides, const Scalar* src_data) {
     Base::Copy(*block, first_coeff_index, tensor_to_block_dim_map,
                tensor_strides, src_data, block->data());
   }
@@ -429,19 +353,18 @@ class TensorBlockReader
  * This class is responsible for writing a tensor block.
  *
  */
-template <typename Scalar, typename Index, int NumDims, int Layout,
-          bool Vectorizable>
-class TensorBlockWriter : public TensorBlockIO<Scalar, Index, NumDims, Layout,
-                                               Vectorizable, false> {
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
+class TensorBlockWriter : public TensorBlockIO<Scalar, StorageIndex, NumDims,
+                                               Layout, /*BlockRead=*/false> {
  public:
-  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+  typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
       TensorBlock;
-  typedef TensorBlockIO<Scalar, Index, NumDims, Layout, Vectorizable, false>
+  typedef TensorBlockIO<Scalar, StorageIndex, NumDims, Layout, /*BlockRead=*/false>
       Base;
 
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
       const TensorBlock& block, Scalar* dst_data) {
-    array<Index, NumDims> tensor_to_block_dim_map;
+    array<StorageIndex, NumDims> tensor_to_block_dim_map;
     for (int i = 0; i < NumDims; ++i) {
       tensor_to_block_dim_map[i] = i;
     }
@@ -450,9 +373,9 @@ class TensorBlockWriter : public TensorBlockIO<Scalar, Index, NumDims, Layout,
   }
 
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const TensorBlock& block, Index first_coeff_index,
-      const array<Index, NumDims>& tensor_to_block_dim_map,
-      const array<Index, NumDims>& tensor_strides, Scalar* dst_data) {
+      const TensorBlock& block, StorageIndex first_coeff_index,
+      const array<StorageIndex, NumDims>& tensor_to_block_dim_map,
+      const array<StorageIndex, NumDims>& tensor_strides, Scalar* dst_data) {
     Base::Copy(block, first_coeff_index, tensor_to_block_dim_map,
                tensor_strides, block.data(), dst_data);
   }
@@ -468,67 +391,34 @@ class TensorBlockWriter : public TensorBlockIO<Scalar, Index, NumDims, Layout,
  * result of the cwise binary op to the strided output array.
  *
  */
-template <bool Vectorizable>
 struct TensorBlockCwiseBinaryOp {
-  template <typename Index, typename BinaryFunctor, typename OutputScalar,
+  template <typename StorageIndex, typename BinaryFunctor, typename OutputScalar,
             typename LeftScalar, typename RightScalar>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const BinaryFunctor& functor, const Index num_coeff,
-      const Index output_index, const Index output_stride,
-      OutputScalar* output_data, const Index left_index,
-      const Index left_stride, const LeftScalar* left_data,
-      const Index right_index, const Index right_stride,
+      const BinaryFunctor& functor, const StorageIndex num_coeff,
+      const StorageIndex output_index, const StorageIndex output_stride,
+      OutputScalar* output_data, const StorageIndex left_index,
+      const StorageIndex left_stride, const LeftScalar* left_data,
+      const StorageIndex right_index, const StorageIndex right_stride,
       const RightScalar* right_data) {
-    for (Index i = 0; i < num_coeff; ++i) {
-      output_data[output_index + i * output_stride] =
-          functor(left_data[left_index + i * left_stride],
-                  right_data[right_index + i * right_stride]);
-    }
-  }
-};
+    using Lhs = const Eigen::Array<LeftScalar, Dynamic, 1>;
+    using Rhs = const Eigen::Array<RightScalar, Dynamic, 1>;
+    using Out = Eigen::Array<OutputScalar, Dynamic, 1>;
 
-template <>
-struct TensorBlockCwiseBinaryOp<true> {
-  template <typename Index, typename BinaryFunctor, typename OutputScalar,
-            typename LeftScalar, typename RightScalar>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const BinaryFunctor& functor, const Index num_coeff,
-      const Index output_index, const Index output_stride,
-      OutputScalar* output_data, const Index left_index,
-      const Index left_stride, const LeftScalar* left_data,
-      const Index right_index, const Index right_stride,
-      const RightScalar* right_data) {
-    EIGEN_STATIC_ASSERT(functor_traits<BinaryFunctor>::PacketAccess,
-                        YOU_MADE_A_PROGRAMMING_MISTAKE);
-    typedef typename packet_traits<OutputScalar>::type OutputPacket;
-    typedef typename packet_traits<LeftScalar>::type LeftPacket;
-    typedef typename packet_traits<RightScalar>::type RightPacket;
-    const Index packet_size = unpacket_traits<OutputPacket>::size;
-    EIGEN_STATIC_ASSERT(unpacket_traits<LeftPacket>::size == packet_size,
-                        YOU_MADE_A_PROGRAMMING_MISTAKE);
-    EIGEN_STATIC_ASSERT(unpacket_traits<RightPacket>::size == packet_size,
-                        YOU_MADE_A_PROGRAMMING_MISTAKE);
-    const Index vectorized_size = (num_coeff / packet_size) * packet_size;
-    if (output_stride != 1 || left_stride != 1 || right_stride != 1) {
-      TensorBlockCwiseBinaryOp<false>::Run(
-          functor, num_coeff, output_index, output_stride, output_data,
-          left_index, left_stride, left_data, right_index, right_stride,
-          right_data);
-      return;
-    }
-    // Vectorization for the most common case.
-    for (Index i = 0; i < vectorized_size; i += packet_size) {
-      LeftPacket l = internal::ploadu<LeftPacket>(left_data + left_index + i);
-      RightPacket r =
-          internal::ploadu<RightPacket>(right_data + right_index + i);
-      OutputPacket p = functor.packetOp(l, r);
-      internal::pstoreu<OutputScalar, OutputPacket>(
-          output_data + output_index + i, p);
-    }
-    for (Index i = vectorized_size; i < num_coeff; ++i) {
-      output_data[output_index + i] =
-          functor(left_data[left_index + i], right_data[right_index + i]);
-    }
+    using LhsMap = Eigen::Map<Lhs, 0, InnerStride<>>;
+    using RhsMap = Eigen::Map<Rhs, 0, InnerStride<>>;
+    using OutMap = Eigen::Map<Out, 0, InnerStride<>>;
+
+    const LeftScalar* lhs_base = &left_data[left_index];
+    const RightScalar* rhs_base = &right_data[right_index];
+    OutputScalar* out_base = &output_data[output_index];
+
+    const LhsMap lhs(lhs_base, num_coeff, InnerStride<>(left_stride));
+    const RhsMap rhs(rhs_base, num_coeff, InnerStride<>(right_stride));
+    OutMap out(out_base, num_coeff, InnerStride<>(output_stride));
+
+    out =
+        Eigen::CwiseBinaryOp<BinaryFunctor, LhsMap, RhsMap>(lhs, rhs, functor);
   }
 };
 
@@ -541,28 +431,26 @@ struct TensorBlockCwiseBinaryOp<true> {
  * This class carries out the binary op on given blocks.
  *
  */
-template <typename BinaryFunctor, typename Index, typename OutputScalar,
+template <typename BinaryFunctor, typename StorageIndex, typename OutputScalar,
           int NumDims, int Layout>
 struct TensorBlockCwiseBinaryIO {
-  typedef typename internal::TensorBlock<OutputScalar, Index, NumDims,
+  typedef typename internal::TensorBlock<OutputScalar, StorageIndex, NumDims,
                                          Layout>::Dimensions Dimensions;
-  typedef internal::TensorBlockCwiseBinaryOp<
-      functor_traits<BinaryFunctor>::PacketAccess>
-      TensorBlockCwiseBinaryOp;
 
   struct BlockIteratorState {
-    Index output_stride, output_span;
-    Index left_stride, left_span;
-    Index right_stride, right_span;
-    Index size, count;
+    StorageIndex output_stride, output_span;
+    StorageIndex left_stride, left_span;
+    StorageIndex right_stride, right_span;
+    StorageIndex size, count;
   };
 
   template <typename LeftScalar, typename RightScalar>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
       const BinaryFunctor& functor, const Dimensions& block_sizes,
       const Dimensions& block_strides, OutputScalar* output_data,
-      const array<Index, NumDims>& left_strides, const LeftScalar* left_data,
-      const array<Index, NumDims>& right_strides,
+      const array<StorageIndex, NumDims>& left_strides,
+      const LeftScalar* left_data,
+      const array<StorageIndex, NumDims>& right_strides,
       const RightScalar* right_data) {
     // Find the innermost dimension whose size is not 1. This is the effective
     // inner dim. If all dimensions are of size 1, fallback to using the actual
@@ -580,7 +468,7 @@ struct TensorBlockCwiseBinaryIO {
         NumDims == 0 ? 1
                      : cond<Layout>()(num_size_one_inner_dims,
                                       NumDims - num_size_one_inner_dims - 1);
-    Index inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
+    StorageIndex inner_dim_size = NumDims == 0 ? 1 : block_sizes[inner_dim];
     for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) {
       const int dim = cond<Layout>()(i, NumDims - i - 1);
       // Merge multiple inner dims into one for larger inner dim size (i.e.
@@ -595,10 +483,12 @@ struct TensorBlockCwiseBinaryIO {
       }
     }
 
-    Index output_index = 0, left_index = 0, right_index = 0;
-    const Index output_stride = NumDims == 0 ? 1 : block_strides[inner_dim];
-    const Index left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
-    const Index right_stride = NumDims == 0 ? 1 : right_strides[inner_dim];
+    StorageIndex output_index = 0, left_index = 0, right_index = 0;
+    const StorageIndex output_stride =
+        NumDims == 0 ? 1 : block_strides[inner_dim];
+    const StorageIndex left_stride = NumDims == 0 ? 1 : left_strides[inner_dim];
+    const StorageIndex right_stride =
+        NumDims == 0 ? 1 : right_strides[inner_dim];
 
     const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
     array<BlockIteratorState, at_least_1_dim> block_iter_state;
@@ -607,7 +497,7 @@ struct TensorBlockCwiseBinaryIO {
     int num_squeezed_dims = 0;
     for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
       const int dim = cond<Layout>()(i + 1, NumDims - i - 2);
-      const Index size = block_sizes[dim];
+      const StorageIndex size = block_sizes[dim];
       if (size == 1) {
         continue;
       }
@@ -624,8 +514,9 @@ struct TensorBlockCwiseBinaryIO {
     }
 
     // Compute cwise binary op.
-    const Index block_total_size = NumDims == 0 ? 1 : block_sizes.TotalSize();
-    for (Index i = 0; i < block_total_size; i += inner_dim_size) {
+    const StorageIndex block_total_size =
+        NumDims == 0 ? 1 : block_sizes.TotalSize();
+    for (StorageIndex i = 0; i < block_total_size; i += inner_dim_size) {
       TensorBlockCwiseBinaryOp::Run(functor, inner_dim_size, output_index,
                                     output_stride, output_data, left_index,
                                     left_stride, left_data, right_index,
@@ -661,10 +552,10 @@ struct TensorBlockCwiseBinaryIO {
 template <class ArgType, class Device>
 struct TensorBlockView {
   typedef TensorEvaluator<ArgType, Device> Impl;
-  typedef typename Impl::Index Index;
+  typedef typename Impl::Index StorageIndex;
   typedef typename remove_const<typename Impl::Scalar>::type Scalar;
   static const int NumDims = array_size<typename Impl::Dimensions>::value;
-  typedef DSizes<Index, NumDims> Dimensions;
+  typedef DSizes<StorageIndex, NumDims> Dimensions;
 
   // Constructs a TensorBlockView for `impl`. `block` is only used for for
   // specifying the start offset, shape, and strides of the block.
@@ -701,7 +592,7 @@ struct TensorBlockView {
           }
         }
       }
-      TensorBlock<Scalar, Index, NumDims, Impl::Layout> input_block(
+      TensorBlock<Scalar, StorageIndex, NumDims, Impl::Layout> input_block(
           block.first_coeff_index(), m_block_sizes, m_block_strides,
           block.tensor_strides(), m_allocated_data);
       impl.block(&input_block);
@@ -733,21 +624,21 @@ struct TensorBlockView {
  *
  * This class is responsible for iterating over the blocks of a tensor.
  */
-template <typename Scalar, typename Index, int NumDims, int Layout>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
 class TensorBlockMapper {
  public:
-  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+  typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
       TensorBlock;
-  typedef DSizes<Index, NumDims> Dimensions;
+  typedef DSizes<StorageIndex, NumDims> Dimensions;
 
   TensorBlockMapper(const Dimensions& dims,
                     const TensorBlockShapeType block_shape,
-                    size_t min_target_size)
+                    Index min_target_size)
       : m_dimensions(dims),
         m_block_dim_sizes(BlockDimensions(dims, block_shape, min_target_size)) {
     // Calculate block counts by dimension and total block count.
-    DSizes<Index, NumDims> block_count;
-    for (size_t i = 0; i < block_count.rank(); ++i) {
+    DSizes<StorageIndex, NumDims> block_count;
+    for (Index i = 0; i < block_count.rank(); ++i) {
       block_count[i] = divup(m_dimensions[i], m_block_dim_sizes[i]);
     }
     m_total_block_count = array_prod(block_count);
@@ -773,15 +664,15 @@ class TensorBlockMapper {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  GetBlockForIndex(Index block_index, Scalar* data) const {
-    Index first_coeff_index = 0;
-    DSizes<Index, NumDims> coords;
-    DSizes<Index, NumDims> sizes;
-    DSizes<Index, NumDims> strides;
+  GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
+    StorageIndex first_coeff_index = 0;
+    DSizes<StorageIndex, NumDims> coords;
+    DSizes<StorageIndex, NumDims> sizes;
+    DSizes<StorageIndex, NumDims> strides;
     if (NumDims > 0) {
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         for (int i = NumDims - 1; i > 0; --i) {
-          const Index idx = block_index / m_block_strides[i];
+          const StorageIndex idx = block_index / m_block_strides[i];
           coords[i] = idx * m_block_dim_sizes[i];
           sizes[i] =
               numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
@@ -799,7 +690,7 @@ class TensorBlockMapper {
         }
       } else {
         for (int i = 0; i < NumDims - 1; ++i) {
-          const Index idx = block_index / m_block_strides[i];
+          const StorageIndex idx = block_index / m_block_strides[i];
           coords[i] = idx * m_block_dim_sizes[i];
           sizes[i] =
               numext::mini((m_dimensions[i] - coords[i]), m_block_dim_sizes[i]);
@@ -824,19 +715,20 @@ class TensorBlockMapper {
                        data);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
     return m_total_block_count;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index block_dims_total_size() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex
+  block_dims_total_size() const {
     return m_block_dim_sizes.TotalSize();
   }
 
  private:
   static Dimensions BlockDimensions(const Dimensions& tensor_dims,
                                     const TensorBlockShapeType block_shape,
-                                    size_t min_target_size) {
-    min_target_size = numext::maxi<size_t>(1, min_target_size);
+                                    Index min_target_size) {
+    min_target_size = numext::maxi<Index>(1, min_target_size);
 
     // If tensor fully fits into the target size, we'll treat it a single block.
     Dimensions block_dim_sizes = tensor_dims;
@@ -865,14 +757,14 @@ class TensorBlockMapper {
               dim_size_target, static_cast<size_t>(tensor_dims[i]));
         }
         // Add any un-allocated coefficients to inner dimension(s).
-        Index total_size = block_dim_sizes.TotalSize();
+        StorageIndex total_size = block_dim_sizes.TotalSize();
         for (int i = 0; i < NumDims; ++i) {
           const int dim = cond<Layout>()(i, NumDims - i - 1);
           if (block_dim_sizes[dim] < tensor_dims[dim]) {
-            const Index total_size_other_dims =
+            const StorageIndex total_size_other_dims =
                 total_size / block_dim_sizes[dim];
-            const Index alloc_avail =
-                divup<Index>(min_target_size, total_size_other_dims);
+            const StorageIndex alloc_avail =
+                divup<StorageIndex>(min_target_size, total_size_other_dims);
             if (alloc_avail == block_dim_sizes[dim]) {
               // Insufficient excess coefficients to allocate.
               break;
@@ -882,14 +774,14 @@ class TensorBlockMapper {
           }
         }
       } else if (block_shape == TensorBlockShapeType::kSkewedInnerDims) {
-        Index coeff_to_allocate = min_target_size;
+        StorageIndex coeff_to_allocate = min_target_size;
         for (int i = 0; i < NumDims; ++i) {
           const int dim = cond<Layout>()(i, NumDims - i - 1);
           block_dim_sizes[dim] =
               numext::mini(coeff_to_allocate, tensor_dims[dim]);
-          coeff_to_allocate =
-              divup(coeff_to_allocate,
-                    numext::maxi(static_cast<Index>(1), block_dim_sizes[dim]));
+          coeff_to_allocate = divup(
+              coeff_to_allocate,
+              numext::maxi(static_cast<StorageIndex>(1), block_dim_sizes[dim]));
         }
         eigen_assert(coeff_to_allocate == 1);
       } else {
@@ -908,7 +800,7 @@ class TensorBlockMapper {
   Dimensions m_block_dim_sizes;
   Dimensions m_block_strides;
   Dimensions m_tensor_strides;
-  Index m_total_block_count;
+  StorageIndex m_total_block_count;
 };
 
 /**
@@ -923,12 +815,12 @@ class TensorBlockMapper {
  * processed together.
  *
  */
-template <typename Scalar, typename Index, int NumDims, int Layout>
+template <typename Scalar, typename StorageIndex, int NumDims, int Layout>
 class TensorSliceBlockMapper {
  public:
-  typedef typename internal::TensorBlock<Scalar, Index, NumDims, Layout>
+  typedef typename internal::TensorBlock<Scalar, StorageIndex, NumDims, Layout>
       TensorBlock;
-  typedef DSizes<Index, NumDims> Dimensions;
+  typedef DSizes<StorageIndex, NumDims> Dimensions;
 
   TensorSliceBlockMapper(const Dimensions& tensor_dims,
                          const Dimensions& tensor_slice_offsets,
@@ -942,7 +834,7 @@ class TensorSliceBlockMapper {
         m_block_stride_order(block_stride_order),
         m_total_block_count(1) {
     // Calculate block counts by dimension and total block count.
-    DSizes<Index, NumDims> block_count;
+    DSizes<StorageIndex, NumDims> block_count;
     for (size_t i = 0; i < block_count.rank(); ++i) {
       block_count[i] = divup(m_tensor_slice_extents[i], m_block_dim_sizes[i]);
     }
@@ -969,11 +861,11 @@ class TensorSliceBlockMapper {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  GetBlockForIndex(Index block_index, Scalar* data) const {
-    Index first_coeff_index = 0;
-    DSizes<Index, NumDims> coords;
-    DSizes<Index, NumDims> sizes;
-    DSizes<Index, NumDims> strides;
+  GetBlockForIndex(StorageIndex block_index, Scalar* data) const {
+    StorageIndex first_coeff_index = 0;
+    DSizes<StorageIndex, NumDims> coords;
+    DSizes<StorageIndex, NumDims> sizes;
+    DSizes<StorageIndex, NumDims> strides;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = block_index / m_block_strides[i];
@@ -991,16 +883,16 @@ class TensorSliceBlockMapper {
           m_block_dim_sizes[0]);
       first_coeff_index += coords[0] * m_tensor_strides[0];
 
-      Index prev_dim = m_block_stride_order[0];
+      StorageIndex prev_dim = m_block_stride_order[0];
       strides[prev_dim] = 1;
       for (int i = 1; i < NumDims; ++i) {
-        const Index curr_dim = m_block_stride_order[i];
+        const StorageIndex curr_dim = m_block_stride_order[i];
         strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
         prev_dim = curr_dim;
       }
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = block_index / m_block_strides[i];
+        const StorageIndex idx = block_index / m_block_strides[i];
         coords[i] = m_tensor_slice_offsets[i] + idx * m_block_dim_sizes[i];
         sizes[i] = numext::mini(
             m_tensor_slice_offsets[i] + m_tensor_slice_extents[i] - coords[i],
@@ -1016,10 +908,10 @@ class TensorSliceBlockMapper {
           m_block_dim_sizes[NumDims - 1]);
       first_coeff_index += coords[NumDims - 1] * m_tensor_strides[NumDims - 1];
 
-      Index prev_dim = m_block_stride_order[NumDims - 1];
+      StorageIndex prev_dim = m_block_stride_order[NumDims - 1];
       strides[prev_dim] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
-        const Index curr_dim = m_block_stride_order[i];
+        const StorageIndex curr_dim = m_block_stride_order[i];
         strides[curr_dim] = strides[prev_dim] * sizes[prev_dim];
         prev_dim = curr_dim;
       }
@@ -1029,7 +921,7 @@ class TensorSliceBlockMapper {
                        data);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index total_block_count() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex total_block_count() const {
     return m_total_block_count;
   }
 
@@ -1041,7 +933,7 @@ class TensorSliceBlockMapper {
   Dimensions m_block_dim_sizes;
   Dimensions m_block_stride_order;
   Dimensions m_block_strides;
-  Index m_total_block_count;
+  StorageIndex m_total_block_count;
 };
 
 }  // namespace internal
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 7ff0d323b..343ab6269 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -1,5 +1,4 @@
 // This file is part of Eigen, a lightweight C++ template library
-// This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index ba02802d2..f9a1bd68c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -51,12 +51,10 @@ struct TensorEvaluator
       typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
       TensorBlock;
   typedef typename internal::TensorBlockReader<
-      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout,
-      PacketAccess>
+      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
       TensorBlockReader;
   typedef typename internal::TensorBlockWriter<
-      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout,
-      PacketAccess>
+      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
       TensorBlockWriter;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
@@ -204,8 +202,7 @@ struct TensorEvaluator<const Derived, Device>
       typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
       TensorBlock;
   typedef typename internal::TensorBlockReader<
-      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout,
-      PacketAccess>
+      typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
       TensorBlockReader;
 
   // Used for accessor extraction in SYCL Managed TensorMap:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 024de3696..ac5afd891 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -36,15 +36,16 @@ template <typename Expression, typename Device, bool Vectorizable,
           bool Tileable>
 class TensorExecutor {
  public:
-  typedef typename Expression::Index Index;
+  using StorageIndex = typename Expression::Index;
+
   EIGEN_DEVICE_FUNC
   static inline void run(const Expression& expr,
                          const Device& device = Device()) {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign) {
-      const Index size = array_prod(evaluator.dimensions());
-      for (Index i = 0; i < size; ++i) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      for (StorageIndex i = 0; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -56,35 +57,36 @@ class TensorExecutor {
  * Process all the data with a single cpu thread, using vectorized instructions.
  */
 template <typename Expression>
-class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tilable*/ false> {
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
+                     /*Tileable*/ false> {
  public:
-  typedef typename Expression::Index Index;
+  using StorageIndex = typename Expression::Index;
 
   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
-  {
+  static inline void run(const Expression& expr,
+                         const DefaultDevice& device = DefaultDevice()) {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
       const int PacketSize = unpacket_traits<typename TensorEvaluator<
           Expression, DefaultDevice>::PacketReturnType>::size;
 
       // Give compiler a strong possibility to unroll the loop. But don't insist
       // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
-      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      const StorageIndex UnrolledSize =
+          (size / (4 * PacketSize)) * 4 * PacketSize;
+      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      const Index VectorizedSize = (size / PacketSize) * PacketSize;
-      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
+      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
         evaluator.evalPacket(i);
       }
-      for (Index i = VectorizedSize; i < size; ++i) {
+      for (StorageIndex i = VectorizedSize; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -97,42 +99,41 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true, /*Tilable
  * sizing a block to fit L1 cache we get better cache performance.
  */
 template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true> {
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                     /*Tileable*/ true> {
  public:
-  typedef typename Expression::Index Index;
+  using Scalar = typename traits<Expression>::Scalar;
+  using ScalarNoConst = typename remove_const<Scalar>::type;
+
+  using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
+  using StorageIndex = typename traits<Expression>::Index;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
 
   EIGEN_DEVICE_FUNC
   static inline void run(const Expression& expr,
                          const DefaultDevice& device = DefaultDevice()) {
-    using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
-
-    using Index = typename traits<Expression>::Index;
-    const int NumDims = traits<Expression>::NumDimensions;
-
-    using Scalar = typename traits<Expression>::Scalar;
-    using ScalarNoConst = typename remove_const<Scalar>::type;
-
     using TensorBlock =
-        TensorBlock<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
-    using TensorBlockMapper =
-        TensorBlockMapper<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
+        TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+    using TensorBlockMapper = TensorBlockMapper<ScalarNoConst, StorageIndex,
+                                                NumDims, Evaluator::Layout>;
 
     Evaluator evaluator(expr, device);
-    std::size_t total_size = array_prod(evaluator.dimensions());
-    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+    Index total_size = array_prod(evaluator.dimensions());
+    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
 
     if (total_size < cache_size) {
       // TODO(andydavis) Reduce block management overhead for small tensors.
       // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
       internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
-                               false>::run(expr, device);
+                               /*Tileable*/ false>::run(expr, device);
       return;
     }
 
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign) {
       // Size tensor blocks to fit in cache (or requested target block size).
-      size_t block_total_size = numext::mini(cache_size, total_size);
+      Index block_total_size = numext::mini(cache_size, total_size);
       TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
       // Query expression tree for desired block size/shape.
       std::vector<TensorOpResourceRequirements> resources;
@@ -146,8 +147,8 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true>
       Scalar* data = static_cast<Scalar*>(
           device.allocate(block_total_size * sizeof(Scalar)));
 
-      const Index total_block_count = block_mapper.total_block_count();
-      for (Index i = 0; i < total_block_count; ++i) {
+      const StorageIndex total_block_count = block_mapper.total_block_count();
+      for (StorageIndex i = 0; i < total_block_count; ++i) {
         TensorBlock block = block_mapper.GetBlockForIndex(i, data);
         evaluator.evalBlock(&block);
       }
@@ -162,37 +163,38 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tilable*/ true>
  * executed on a single core.
  */
 #ifdef EIGEN_USE_THREADS
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EvalRange {
-  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+  static void run(Evaluator* evaluator_in, const StorageIndex first,
+                  const StorageIndex last) {
     Evaluator evaluator = *evaluator_in;
     eigen_assert(last >= first);
-    for (Index i = first; i < last; ++i) {
+    for (StorageIndex i = first; i < last; ++i) {
       evaluator.evalScalar(i);
     }
   }
 
-  static Index alignBlockSize(Index size) {
-    return size;
-  }
+  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
 };
 
-template <typename Evaluator, typename Index>
-struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
-  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+template <typename Evaluator, typename StorageIndex>
+struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
+  static const int PacketSize =
+      unpacket_traits<typename Evaluator::PacketReturnType>::size;
 
-  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+  static void run(Evaluator* evaluator_in, const StorageIndex first,
+                  const StorageIndex last) {
     Evaluator evaluator = *evaluator_in;
     eigen_assert(last >= first);
-    Index i = first;
+    StorageIndex i = first;
     if (last - first >= PacketSize) {
       eigen_assert(first % PacketSize == 0);
-      Index last_chunk_offset = last - 4 * PacketSize;
+      StorageIndex last_chunk_offset = last - 4 * PacketSize;
       // Give compiler a strong possibility to unroll the loop. But don't insist
       // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      for (; i <= last_chunk_offset; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
@@ -206,7 +208,7 @@ struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
     }
   }
 
-  static Index alignBlockSize(Index size) {
+  static StorageIndex alignBlockSize(StorageIndex size) {
     // Align block size to packet size and account for unrolling in run above.
     if (size >= 16 * PacketSize) {
       return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
@@ -219,24 +221,24 @@ struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
 template <typename Expression, bool Vectorizable, bool Tileable>
 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
  public:
-  typedef typename Expression::Index Index;
+  using StorageIndex = typename Expression::Index;
 
   static inline void run(const Expression& expr,
                          const ThreadPoolDevice& device) {
     typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-    typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange;
+    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
 
     Evaluator evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
     if (needs_assign) {
-      const Index PacketSize =
+      const StorageIndex PacketSize =
           Vectorizable
               ? unpacket_traits<typename Evaluator::PacketReturnType>::size
               : 1;
-      const Index size = array_prod(evaluator.dimensions());
+      const StorageIndex size = array_prod(evaluator.dimensions());
       device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                          EvalRange::alignBlockSize,
-                         [&evaluator](Index first, Index last) {
+                         [&evaluator](StorageIndex first, StorageIndex last) {
                            EvalRange::run(&evaluator, first, last);
                          });
     }
@@ -247,24 +249,24 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
 template <typename Expression, bool Vectorizable>
 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
  public:
-  typedef typename Expression::Index Index;
+  using Scalar = typename traits<Expression>::Scalar;
+  using ScalarNoConst = typename remove_const<Scalar>::type;
 
-  static inline void run(const Expression& expr,
-                         const ThreadPoolDevice& device) {
-    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-    typedef typename internal::remove_const<
-        typename traits<Expression>::Scalar>::type Scalar;
-    typedef typename traits<Expression>::Index Index;
+  using Evaluator = TensorEvaluator<Expression, ThreadPoolDevice>;
+  using StorageIndex = typename traits<Expression>::Index;
 
-    static const int NumDims = traits<Expression>::NumDimensions;
+  static const int NumDims = traits<Expression>::NumDimensions;
 
-    typedef TensorBlock<Scalar, Index, NumDims, Evaluator::Layout> TensorBlock;
-    typedef TensorBlockMapper<Scalar, Index, NumDims, Evaluator::Layout>
-        TensorBlockMapper;
+  static inline void run(const Expression& expr,
+                         const ThreadPoolDevice& device) {
+    using TensorBlock =
+        TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
+    using TensorBlockMapper =
+        TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout>;
 
     Evaluator evaluator(expr, device);
-    std::size_t total_size = array_prod(evaluator.dimensions());
-    std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+    StorageIndex total_size = array_prod(evaluator.dimensions());
+    StorageIndex cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
     if (total_size < cache_size) {
       // TODO(andydavis) Reduce block management overhead for small tensors.
       internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
@@ -276,7 +278,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
     if (needs_assign) {
       TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
-      size_t block_total_size = 0;
+      Index block_total_size = 0;
       // Query expression tree for desired block size/shape.
       std::vector<internal::TensorOpResourceRequirements> resources;
       evaluator.getResourceRequirements(&resources);
@@ -296,15 +298,16 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
       void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
       device.parallelFor(
           block_mapper.total_block_count(), cost * block_size,
-          [=, &device, &evaluator, &block_mapper](Index first, Index last) {
+          [=, &device, &evaluator, &block_mapper](StorageIndex first,
+                                                  StorageIndex last) {
             // currentThreadId() returns -1 if called from a thread not in the
-            // threadpool, such as the main thread dispatching Eigen
+            // thread pool, such as the main thread dispatching Eigen
             // expressions.
             const int thread_idx = device.currentThreadId();
             eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
             Scalar* thread_buf = reinterpret_cast<Scalar*>(
                 static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
-            for (Index i = first; i < last; ++i) {
+            for (StorageIndex i = first; i < last; ++i) {
               auto block = block_mapper.GetBlockForIndex(i, thread_buf);
               evaluator.evalBlock(&block);
             }
@@ -324,51 +327,51 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
 template <typename Expression, bool Vectorizable, bool Tileable>
 class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
  public:
-  typedef typename Expression::Index Index;
+  typedef typename Expression::Index StorageIndex;
   static void run(const Expression& expr, const GpuDevice& device);
 };
 
 
 #if defined(EIGEN_GPUCC)
-template <typename Evaluator, typename Index, bool Vectorizable>
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EigenMetaKernelEval {
   static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, Index first, Index last, Index step_size) {
-    for (Index i = first; i < last; i += step_size) {
+  void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+    for (StorageIndex i = first; i < last; i += step_size) {
       eval.evalScalar(i);
     }
   }
 };
 
-template <typename Evaluator, typename Index>
-struct EigenMetaKernelEval<Evaluator, Index, true> {
+template <typename Evaluator, typename StorageIndex>
+struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
   static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, Index first, Index last, Index step_size) {
-    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const Index vectorized_size = (last / PacketSize) * PacketSize;
-    const Index vectorized_step_size = step_size * PacketSize;
+  void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const StorageIndex vectorized_size = (last / PacketSize) * PacketSize;
+    const StorageIndex vectorized_step_size = step_size * PacketSize;
 
     // Use the vector path
-    for (Index i = first * PacketSize; i < vectorized_size;
+    for (StorageIndex i = first * PacketSize; i < vectorized_size;
          i += vectorized_step_size) {
       eval.evalPacket(i);
     }
-    for (Index i = vectorized_size + first; i < last; i += step_size) {
+    for (StorageIndex i = vectorized_size + first; i < last; i += step_size) {
       eval.evalScalar(i);
     }
   }
 };
 
-template <typename Evaluator, typename Index>
+template <typename Evaluator, typename StorageIndex>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, Index size) {
+EigenMetaKernel(Evaluator eval, StorageIndex size) {
 
-  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index step_size = blockDim.x * gridDim.x;
+  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const StorageIndex step_size = blockDim.x * gridDim.x;
 
   const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
-  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
+  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
 }
 
 /*static*/
@@ -382,12 +385,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
     const int block_size = device.maxGpuThreadsPerBlock();
     const int max_blocks = device.getNumGpuMultiProcessors() *
                            device.maxGpuThreadsPerMultiProcessor() / block_size;
-    const Index size = array_prod(evaluator.dimensions());
+    const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
 
     LAUNCH_GPU_KERNEL(
-        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
         num_blocks, block_size, 0, device, evaluator, size);
   }
   evaluator.cleanup();
diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp
index 416b686e4..6feeff231 100644
--- a/unsupported/test/cxx11_tensor_block_access.cpp
+++ b/unsupported/test/cxx11_tensor_block_access.cpp
@@ -37,6 +37,31 @@ static std::size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) {
   return internal::random<int>(1, dims.TotalSize());
 }
 
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims() {
+  array<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<int>(1, 20);
+  }
+  return DSizes<Index, NumDims>(dims);
+};
+
+/** Dummy data type to test TensorBlock copy ops. */
+struct Data {
+  Data() : Data(0) {}
+  explicit Data(int v) { value = v; }
+  int value;
+};
+
+bool operator==(const Data& lhs, const Data& rhs) {
+  return lhs.value == rhs.value;
+}
+
+std::ostream& operator<<(std::ostream& os, const Data& d) {
+  os << "Data: value=" << d.value;
+  return os;
+}
+
 template <typename T>
 static T* GenerateRandomData(const Index& size) {
   T* data = new T[size];
@@ -46,6 +71,23 @@ static T* GenerateRandomData(const Index& size) {
   return data;
 }
 
+template <>
+Data* GenerateRandomData(const Index& size) {
+  Data* data = new Data[size];
+  for (int i = 0; i < size; ++i) {
+    data[i] = Data(internal::random<int>(1, 100));
+  }
+  return data;
+}
+
+template <int NumDims>
+static void Debug(DSizes<Index, NumDims> dims) {
+  for (int i = 0; i < NumDims; ++i) {
+    std::cout << dims[i] << "; ";
+  }
+  std::cout << std::endl;
+}
+
 template <int Layout>
 static void test_block_mapper_sanity()
 {
@@ -96,7 +138,7 @@ static void test_block_mapper_sanity()
 // index in the visited set. Verify that every coeff accessed only once.
 template <typename T, int Layout, int NumDims>
 static void UpdateCoeffSet(
-    const internal::TensorBlock<T, Index, 4, Layout>& block,
+    const internal::TensorBlock<T, Index, NumDims, Layout>& block,
     Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) {
   const DSizes<Index, NumDims> block_sizes = block.block_sizes();
   const DSizes<Index, NumDims> tensor_strides = block.tensor_strides();
@@ -114,14 +156,13 @@ static void UpdateCoeffSet(
   }
 }
 
-template <int Layout>
-static void test_block_mapper_maps_every_element()
-{
-  using T = int;
-  using TensorBlock = internal::TensorBlock<T, Index, 4, Layout>;
-  using TensorBlockMapper = internal::TensorBlockMapper<T, Index, 4, Layout>;
+template <typename T, int NumDims, int Layout>
+static void test_block_mapper_maps_every_element() {
+  using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
+  using TensorBlockMapper =
+      internal::TensorBlockMapper<T, Index, NumDims, Layout>;
 
-  DSizes<Index, 4> dims(5, 7, 11, 17);
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>();
 
   // Keep track of elements indices available via block access.
   std::set<Index> coeff_set;
@@ -131,29 +172,36 @@ static void test_block_mapper_maps_every_element()
 
   for (int i = 0; i < block_mapper.total_block_count(); ++i) {
     TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
-    UpdateCoeffSet<T, Layout, 4>(block, block.first_coeff_index(),
-                                 choose(Layout, 3, 0), &coeff_set);
+    UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
+                                       choose(Layout, NumDims - 1, 0),
+                                       &coeff_set);
   }
 
   // Verify that every coefficient in the original Tensor is accessible through
   // TensorBlock only once.
-  auto total_coeffs = static_cast<int>(dims.TotalSize());
+  Index total_coeffs = dims.TotalSize();
   VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
-  VERIFY_IS_EQUAL(*coeff_set.begin(), static_cast<Index>(0));
-  VERIFY_IS_EQUAL(*coeff_set.rbegin(), static_cast<Index>(total_coeffs - 1));
+  VERIFY_IS_EQUAL(*coeff_set.begin(), 0);
+  VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
 }
 
-template <int Layout>
-static void test_slice_block_mapper_maps_every_element()
-{
-  using T = int;
-  using TensorBlock = internal::TensorBlock<T, Index, 4, Layout>;
+template <typename T, int NumDims, int Layout>
+static void test_slice_block_mapper_maps_every_element() {
+  using TensorBlock = internal::TensorBlock<T, Index, NumDims, Layout>;
   using TensorSliceBlockMapper =
-      internal::TensorSliceBlockMapper<T, Index, 4, Layout>;
+      internal::TensorSliceBlockMapper<T, Index, NumDims, Layout>;
 
-  DSizes<Index, 4> tensor_dims(5,7,11,17);
-  DSizes<Index, 4> tensor_slice_offsets(1,3,5,7);
-  DSizes<Index, 4> tensor_slice_extents(3,2,4,5);
+  DSizes<Index, NumDims> tensor_dims = RandomDims<NumDims>();
+  DSizes<Index, NumDims> tensor_slice_offsets = RandomDims<NumDims>();
+  DSizes<Index, NumDims> tensor_slice_extents = RandomDims<NumDims>();
+
+  // Make sure that tensor offsets + extents do not overflow.
+  for (int i = 0; i < NumDims; ++i) {
+    tensor_slice_offsets[i] =
+        numext::mini(tensor_dims[i] - 1, tensor_slice_offsets[i]);
+    tensor_slice_extents[i] = numext::mini(
+        tensor_slice_extents[i], tensor_dims[i] - tensor_slice_offsets[i]);
+  }
 
   // Keep track of elements indices available via block access.
   std::set<Index> coeff_set;
@@ -161,61 +209,59 @@ static void test_slice_block_mapper_maps_every_element()
   auto total_coeffs = static_cast<int>(tensor_slice_extents.TotalSize());
 
   // Pick a random dimension sizes for the tensor blocks.
-  DSizes<Index, 4> block_sizes;
-  for (int i = 0; i < 4; ++i) {
+  DSizes<Index, NumDims> block_sizes;
+  for (int i = 0; i < NumDims; ++i) {
     block_sizes[i] = internal::random<int>(1, tensor_slice_extents[i]);
   }
 
   TensorSliceBlockMapper block_mapper(tensor_dims, tensor_slice_offsets,
                                       tensor_slice_extents, block_sizes,
-                                      DimensionList<Index, 4>());
+                                      DimensionList<Index, NumDims>());
 
   for (int i = 0; i < block_mapper.total_block_count(); ++i) {
     TensorBlock block = block_mapper.GetBlockForIndex(i, nullptr);
-    UpdateCoeffSet<T, Layout, 4>(block, block.first_coeff_index(),
-                                 choose(Layout, 3, 0), &coeff_set);
+    UpdateCoeffSet<T, Layout, NumDims>(block, block.first_coeff_index(),
+                                       choose(Layout, NumDims - 1, 0),
+                                       &coeff_set);
   }
 
   VERIFY_IS_EQUAL(coeff_set.size(), total_coeffs);
 }
 
-template <int Layout>
-static void test_block_io_copy_data_from_source_to_target()
-{
-  using T = float;
-
-  typedef internal::TensorBlock<T, Index, 5, Layout> TensorBlock;
-  typedef internal::TensorBlockMapper<T, Index, 5, Layout> TensorBlockMapper;
+template <typename T, int NumDims, int Layout>
+static void test_block_io_copy_data_from_source_to_target() {
+  typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
+  typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
+      TensorBlockMapper;
 
-  typedef internal::TensorBlockReader<T, Index, 5, Layout, true>
+  typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
       TensorBlockReader;
-  typedef internal::TensorBlockWriter<T, Index, 5, Layout, true>
+  typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
       TensorBlockWriter;
 
-  typedef std::vector<T, aligned_allocator<T>> DataVector;
-
-  DSizes<Index, 5> input_tensor_dims(5, 7, 11, 17, 3);
+  DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
   const auto input_tensor_size = input_tensor_dims.TotalSize();
-  DataVector input_data(input_tensor_size, 0);
-  for (int i = 0; i < input_tensor_size; ++i) {
-    input_data[i] = internal::random<T>();
-  }
 
-  DataVector output_data(input_tensor_size, 0);
+  T* input_data = GenerateRandomData<T>(input_tensor_size);
+  T* output_data = new T[input_tensor_size];
 
   TensorBlockMapper block_mapper(input_tensor_dims, RandomShape(),
                                  RandomTargetSize(input_tensor_dims));
+  T* block_data = new T[block_mapper.block_dims_total_size()];
 
-  DataVector block_data(block_mapper.block_dims_total_size(), 0);
   for (int i = 0; i < block_mapper.total_block_count(); ++i) {
-    TensorBlock block = block_mapper.GetBlockForIndex(i, block_data.data());
-    TensorBlockReader::Run(&block, input_data.data());
-    TensorBlockWriter::Run(block, output_data.data());
+    TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
+    TensorBlockReader::Run(&block, input_data);
+    TensorBlockWriter::Run(block, output_data);
   }
 
   for (int i = 0; i < input_tensor_size; ++i) {
     VERIFY_IS_EQUAL(input_data[i], output_data[i]);
   }
+
+  delete[] input_data;
+  delete[] output_data;
+  delete[] block_data;
 }
 
 template <int Layout, int NumDims>
@@ -261,31 +307,32 @@ static array<Index, NumDims> ComputeStrides(
   return strides;
 }
 
-template <int Layout>
+template <typename T, int NumDims, int Layout>
 static void test_block_io_copy_using_reordered_dimensions() {
-  typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
-  typedef internal::TensorBlockMapper<float, Index, 5, Layout>
+  typedef internal::TensorBlock<T, Index, NumDims, Layout> TensorBlock;
+  typedef internal::TensorBlockMapper<T, Index, NumDims, Layout>
       TensorBlockMapper;
 
-  typedef internal::TensorBlockReader<float, Index, 5, Layout, false>
+  typedef internal::TensorBlockReader<T, Index, NumDims, Layout>
       TensorBlockReader;
-  typedef internal::TensorBlockWriter<float, Index, 5, Layout, false>
+  typedef internal::TensorBlockWriter<T, Index, NumDims, Layout>
       TensorBlockWriter;
 
-  DSizes<Index, 5> input_tensor_dims(5, 7, 11, 17, 3);
+  DSizes<Index, NumDims> input_tensor_dims = RandomDims<NumDims>();
   const auto input_tensor_size = input_tensor_dims.TotalSize();
 
   // Create a random input tensor.
-  auto* input_data = GenerateRandomData<float>(input_tensor_size);
+  T* input_data = GenerateRandomData<T>(input_tensor_size);
 
   // Create a random dimension re-ordering/shuffle.
-  std::vector<Index> shuffle = {0, 1, 2, 3, 4};
+  std::vector<Index> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
   std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
 
-  DSizes<Index, 5> output_tensor_dims;
-  array<Index, 5> input_to_output_dim_map;
-  array<Index, 5> output_to_input_dim_map;
-  for (Index i = 0; i < 5; ++i) {
+  DSizes<Index, NumDims> output_tensor_dims;
+  array<Index, NumDims> input_to_output_dim_map;
+  array<Index, NumDims> output_to_input_dim_map;
+  for (Index i = 0; i < NumDims; ++i) {
     output_tensor_dims[shuffle[i]] = input_tensor_dims[i];
     input_to_output_dim_map[i] = shuffle[i];
     output_to_input_dim_map[shuffle[i]] = i;
@@ -295,17 +342,17 @@ static void test_block_io_copy_using_reordered_dimensions() {
   TensorBlockMapper block_mapper(output_tensor_dims, RandomShape(),
                                  RandomTargetSize(input_tensor_dims));
 
-  auto* block_data = new float[block_mapper.block_dims_total_size()];
-  auto* output_data = new float[input_tensor_size];
+  auto* block_data = new T[block_mapper.block_dims_total_size()];
+  auto* output_data = new T[input_tensor_size];
 
-  array<Index, 5> input_tensor_strides =
-      ComputeStrides<Layout, 5>(input_tensor_dims);
-  array<Index, 5> output_tensor_strides =
-      ComputeStrides<Layout, 5>(output_tensor_dims);
+  array<Index, NumDims> input_tensor_strides =
+      ComputeStrides<Layout, NumDims>(input_tensor_dims);
+  array<Index, NumDims> output_tensor_strides =
+      ComputeStrides<Layout, NumDims>(output_tensor_dims);
 
   for (Index i = 0; i < block_mapper.total_block_count(); ++i) {
     TensorBlock block = block_mapper.GetBlockForIndex(i, block_data);
-    const Index first_coeff_index = GetInputIndex<Layout, 5>(
+    const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
         block.first_coeff_index(), output_to_input_dim_map,
         input_tensor_strides, output_tensor_strides);
     TensorBlockReader::Run(&block, first_coeff_index, input_to_output_dim_map,
@@ -327,18 +374,21 @@ template <int Layout>
 static void test_block_io_zero_stride()
 {
   typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
-  typedef internal::TensorBlockReader<float, Index, 5, Layout, true>
+  typedef internal::TensorBlockReader<float, Index, 5, Layout>
       TensorBlockReader;
-  typedef internal::TensorBlockWriter<float, Index, 5, Layout, true>
+  typedef internal::TensorBlockWriter<float, Index, 5, Layout>
       TensorBlockWriter;
 
-  DSizes<Index, 5> input_tensor_dims(1, 2, 1, 3, 1);
-  const auto input_tensor_size = input_tensor_dims.TotalSize();
+  DSizes<Index, 5> rnd_dims = RandomDims<5>();
 
-  // Create a random input tensor.
+  DSizes<Index, 5> input_tensor_dims = rnd_dims;
+  input_tensor_dims[0] = 1;
+  input_tensor_dims[2] = 1;
+  input_tensor_dims[4] = 1;
+  const auto input_tensor_size = input_tensor_dims.TotalSize();
   auto* input_data = GenerateRandomData<float>(input_tensor_size);
 
-  DSizes<Index, 5> output_tensor_dims(3, 2, 3, 3, 2);
+  DSizes<Index, 5> output_tensor_dims = rnd_dims;
 
   DSizes<Index, 5> input_tensor_strides(
       ComputeStrides<Layout, 5>(input_tensor_dims));
@@ -401,9 +451,9 @@ static void test_block_io_zero_stride()
 template <int Layout>
 static void test_block_io_squeeze_ones() {
   typedef internal::TensorBlock<float, Index, 5, Layout> TensorBlock;
-  typedef internal::TensorBlockReader<float, Index, 5, Layout, true>
+  typedef internal::TensorBlockReader<float, Index, 5, Layout>
       TensorBlockReader;
-  typedef internal::TensorBlockWriter<float, Index, 5, Layout, true>
+  typedef internal::TensorBlockWriter<float, Index, 5, Layout>
       TensorBlockWriter;
 
   // Total size > 1.
@@ -467,23 +517,23 @@ static void test_block_io_squeeze_ones() {
   }
 }
 
-template <int Layout>
+template <typename T, int NumDims, int Layout>
 static void test_block_cwise_binary_io_basic() {
-  typedef internal::scalar_sum_op<float> BinaryFunctor;
-  typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, float, 5,
+  typedef internal::scalar_sum_op<T> BinaryFunctor;
+  typedef internal::TensorBlockCwiseBinaryIO<BinaryFunctor, Index, T, NumDims,
                                              Layout>
       TensorBlockCwiseBinaryIO;
 
-  DSizes<Index, 5> block_sizes(2, 3, 5, 7, 11);
-  DSizes<Index, 5> strides(ComputeStrides<Layout, 5>(block_sizes));
+  DSizes<Index, NumDims> block_sizes = RandomDims<NumDims>();
+  DSizes<Index, NumDims> strides(ComputeStrides<Layout, NumDims>(block_sizes));
 
   const auto total_size = block_sizes.TotalSize();
 
   // Create a random input tensors.
-  auto* left_data = GenerateRandomData<float>(total_size);
-  auto* right_data = GenerateRandomData<float>(total_size);
+  T* left_data = GenerateRandomData<T>(total_size);
+  T* right_data = GenerateRandomData<T>(total_size);
 
-  auto* output_data = new float[total_size];
+  T* output_data = new T[total_size];
   BinaryFunctor functor;
   TensorBlockCwiseBinaryIO::Run(functor, block_sizes, strides, output_data,
                                 strides, left_data, strides, right_data);
@@ -532,13 +582,22 @@ static void test_block_cwise_binary_io_zero_strides() {
                                              Layout>
       TensorBlockCwiseBinaryIO;
 
-  DSizes<Index, 5> left_sizes(1, 3, 1, 7, 1);
+  DSizes<Index, 5> rnd_dims = RandomDims<5>();
+
+  DSizes<Index, 5> left_sizes = rnd_dims;
+  left_sizes[0] = 1;
+  left_sizes[2] = 1;
+  left_sizes[4] = 1;
+
   DSizes<Index, 5> left_strides(ComputeStrides<Layout, 5>(left_sizes));
   left_strides[0] = 0;
   left_strides[2] = 0;
   left_strides[4] = 0;
 
-  DSizes<Index, 5> right_sizes(2, 1, 5, 1, 11);
+  DSizes<Index, 5> right_sizes = rnd_dims;
+  right_sizes[1] = 0;
+  right_sizes[3] = 0;
+
   DSizes<Index, 5> right_strides(ComputeStrides<Layout, 5>(right_sizes));
   right_strides[1] = 0;
   right_strides[3] = 0;
@@ -547,7 +606,7 @@ static void test_block_cwise_binary_io_zero_strides() {
   auto* left_data = GenerateRandomData<float>(left_sizes.TotalSize());
   auto* right_data = GenerateRandomData<float>(right_sizes.TotalSize());
 
-  DSizes<Index, 5> output_sizes(2, 3, 5, 7, 11);
+  DSizes<Index, 5> output_sizes = rnd_dims;
   DSizes<Index, 5> output_strides(ComputeStrides<Layout, 5>(output_sizes));
 
   const auto output_total_size = output_sizes.TotalSize();
@@ -557,11 +616,11 @@ static void test_block_cwise_binary_io_zero_strides() {
   TensorBlockCwiseBinaryIO::Run(functor, output_sizes, output_strides,
                                 output_data, left_strides, left_data,
                                 right_strides, right_data);
-  for (int i = 0; i < 2; ++i) {
-    for (int j = 0; j < 3; ++j) {
-      for (int k = 0; k < 5; ++k) {
-        for (int l = 0; l < 7; ++l) {
-          for (int m = 0; m < 11; ++m) {
+  for (int i = 0; i < rnd_dims[0]; ++i) {
+    for (int j = 0; j < rnd_dims[1]; ++j) {
+      for (int k = 0; k < rnd_dims[2]; ++k) {
+        for (int l = 0; l < rnd_dims[3]; ++l) {
+          for (int m = 0; m < rnd_dims[4]; ++m) {
             Index output_index = i * output_strides[0] + j * output_strides[1] +
                                  k * output_strides[2] + l * output_strides[3] +
                                  m * output_strides[4];
@@ -893,31 +952,44 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
   }
 }
 
-#define CALL_SUBTEST_LAYOUTS(NAME) \
+#define TEST_LAYOUTS(NAME) \
   CALL_SUBTEST(NAME<ColMajor>()); \
   CALL_SUBTEST(NAME<RowMajor>())
 
-#define CALL_SUBTEST_LAYOUTS_WITH_ARG(NAME, ARG) \
+#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME)    \
+  CALL_SUBTEST((NAME<TYPE, 1, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 1, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 2, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 2, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 3, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 3, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 4, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 4, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 5, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 5, RowMajor>()))
+
+#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \
   CALL_SUBTEST(NAME<ColMajor>(ARG)); \
   CALL_SUBTEST(NAME<RowMajor>(ARG))
 
 EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
-  CALL_SUBTEST_LAYOUTS(test_block_mapper_sanity);
-  CALL_SUBTEST_LAYOUTS(test_block_mapper_maps_every_element);
-  CALL_SUBTEST_LAYOUTS(test_slice_block_mapper_maps_every_element);
-  CALL_SUBTEST_LAYOUTS(test_block_io_copy_data_from_source_to_target);
-  CALL_SUBTEST_LAYOUTS(test_block_io_copy_using_reordered_dimensions);
-  CALL_SUBTEST_LAYOUTS(test_block_io_zero_stride);
-  CALL_SUBTEST_LAYOUTS(test_block_io_squeeze_ones);
-  CALL_SUBTEST_LAYOUTS(test_block_cwise_binary_io_basic);
-  CALL_SUBTEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones);
-  CALL_SUBTEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
-  CALL_SUBTEST_LAYOUTS(test_uniform_block_shape);
-  CALL_SUBTEST_LAYOUTS(test_skewed_inner_dim_block_shape);
-
-  CALL_SUBTEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
-  CALL_SUBTEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
+  TEST_LAYOUTS(test_block_mapper_sanity);
+  TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
+  TEST_LAYOUTS_AND_DIMS(float, test_slice_block_mapper_maps_every_element);
+  TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_data_from_source_to_target);
+  TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_data_from_source_to_target);
+  TEST_LAYOUTS_AND_DIMS(float, test_block_io_copy_using_reordered_dimensions);
+  TEST_LAYOUTS_AND_DIMS(Data, test_block_io_copy_using_reordered_dimensions);
+  TEST_LAYOUTS(test_block_io_zero_stride);
+  TEST_LAYOUTS(test_block_io_squeeze_ones);
+  TEST_LAYOUTS_AND_DIMS(float, test_block_cwise_binary_io_basic);
+  TEST_LAYOUTS(test_block_cwise_binary_io_squeeze_ones);
+  TEST_LAYOUTS(test_block_cwise_binary_io_zero_strides);
+  TEST_LAYOUTS(test_uniform_block_shape);
+  TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
+  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
+  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
 }
 
-#undef CALL_SUBTEST_LAYOUTS
-#undef CALL_SUBTEST_LAYOUTS_WITH_ARG
\ No newline at end of file
+#undef TEST_LAYOUTS
+#undef TEST_LAYOUTS_WITH_ARG
\ No newline at end of file
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index 5ae45ac5b..274f901ce 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -13,7 +13,6 @@
 
 #include <Eigen/CXX11/Tensor>
 
-using Eigen::Index;
 using Eigen::Tensor;
 using Eigen::RowMajor;
 using Eigen::ColMajor;
@@ -25,9 +24,16 @@ template <typename Device, bool Vectorizable, bool Tileable, int Layout>
 static void test_execute_binary_expr(Device d) {
   // Pick a large enough tensor size to bypass small tensor block evaluation
   // optimization.
-  Tensor<float, 3> lhs(840, 390, 37);
-  Tensor<float, 3> rhs(840, 390, 37);
-  Tensor<float, 3> dst(840, 390, 37);
+  int d0 = internal::random<int>(100, 200);
+  int d1 = internal::random<int>(100, 200);
+  int d2 = internal::random<int>(100, 200);
+
+  static constexpr int Options = 0;
+  using IndexType = int;
+
+  Tensor<float, 3, Options, IndexType> lhs(d0, d1, d2);
+  Tensor<float, 3, Options, IndexType> rhs(d0, d1, d2);
+  Tensor<float, 3, Options, IndexType> dst(d0, d1, d2);
 
   lhs.setRandom();
   rhs.setRandom();
@@ -40,9 +46,9 @@ static void test_execute_binary_expr(Device d) {
 
   Executor::run(Assign(dst, expr), d);
 
-  for (int i = 0; i < 840; ++i) {
-    for (int j = 0; j < 390; ++j) {
-      for (int k = 0; k < 37; ++k) {
+  for (int i = 0; i < d0; ++i) {
+    for (int j = 0; j < d1; ++j) {
+      for (int k = 0; k < d2; ++k) {
         float sum = lhs(i, j, k) + rhs(i, j, k);
         VERIFY_IS_EQUAL(sum, dst(i, j, k));
       }
-- 
cgit v1.2.3