From 6913221c43c6ad41b1fbfc0d263d2764abd11ad2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 25 Jul 2018 13:51:10 -0700 Subject: Add tiled evaluation support to TensorExecutor --- unsupported/Eigen/CXX11/Tensor | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 44 +++- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 113 ++++++++- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 + .../Eigen/CXX11/src/Tensor/TensorChipping.h | 2 + .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 2 + .../Eigen/CXX11/src/Tensor/TensorContraction.h | 1 + .../Eigen/CXX11/src/Tensor/TensorConversion.h | 1 + .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 10 +- .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 1 + .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 16 ++ unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 1 + .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 101 +++++++- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 262 ++++++++++++++++----- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2 + .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 1 + .../CXX11/src/Tensor/TensorForwardDeclarations.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 1 + .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 2 + .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 8 +- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 1 + .../Eigen/CXX11/src/Tensor/TensorReduction.h | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 3 + unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 2 + .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 2 + .../Eigen/CXX11/src/Tensor/TensorStriding.h | 2 + unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 7 +- unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_block_access.cpp | 2 +- .../test/cxx11_tensor_complex_cwise_ops_gpu.cu | 2 +- unsupported/test/cxx11_tensor_complex_gpu.cu | 2 +- unsupported/test/cxx11_tensor_executor.cpp | 81 +++++++ 33 files changed, 596 insertions(+), 91 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_executor.cpp diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 397d55f76..47514703a 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -112,13 +112,13 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorGlobalFunctions.h" #include "src/Tensor/TensorBase.h" +#include "src/Tensor/TensorBlock.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" #include "src/Tensor/TensorReductionGpu.h" #include "src/Tensor/TensorArgMax.h" -#include "src/Tensor/TensorBlock.h" #include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContractionBlocking.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 027305586..199ddb123 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -68,6 +68,8 @@ class TensorAssignOp : public TensorBase typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; + static const int NumDims = Eigen::internal::traits::NumDimensions; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} @@ -95,20 +97,33 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType 
CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; + static const int PacketSize = internal::unpacket_traits::size; + static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess }; + typedef typename internal::TensorBlock< + typename internal::remove_const::type, Index, NumDims, Layout> + TensorBlock; + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT( + (static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -164,6 +179,25 @@ struct TensorEvaluator, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + m_leftImpl.getResourceRequirements(resources); + m_rightImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) { + if (TensorEvaluator::RawAccess && + m_leftImpl.data() != nullptr) { + TensorBlock left_block(block->first_coeff_index(), block->block_sizes(), + block->tensor_strides(), block->tensor_strides(), + m_leftImpl.data() + block->first_coeff_index()); + m_rightImpl.block(&left_block); + } else { + m_rightImpl.block(block); + m_leftImpl.writeBlock(*block); + } + } + /// required by sycl in order to extract the accessor const TensorEvaluator& left_impl() const { return m_leftImpl; } /// required by sycl in order to extract the accessor diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 8ffc9d093..5321acecf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -65,6 +65,40 @@ enum class TensorBlockShapeType { kSkewedInnerDims, }; +struct TensorOpResourceRequirements { + TensorBlockShapeType block_shape; + std::size_t block_total_size; + // TODO(andydavis) Add 'target_num_threads' to support communication of + // thread-resource requirements. This will allow ops deep in the + // expression tree (like reductions) to communicate resources + // requirements based on local state (like the total number of reductions + // to be computed). + TensorOpResourceRequirements(internal::TensorBlockShapeType shape, + const std::size_t size) + : block_shape(shape), block_total_size(size) {} +}; + +// Tries to merge multiple resource requirements. +EIGEN_STRONG_INLINE void MergeResourceRequirements( + const std::vector& resources, + TensorBlockShapeType* block_shape, std::size_t* block_total_size) { + if (resources.empty()) { + return; + } + // TODO(andydavis) Implement different policies (i.e. 
revert to a default + // policy if block shapes/sizes conflict). + *block_shape = resources[0].block_shape; + *block_total_size = resources[0].block_total_size; + for (int i = 1; i < resources.size(); ++i) { + if (resources[i].block_shape == TensorBlockShapeType::kSkewedInnerDims && + *block_shape != TensorBlockShapeType::kSkewedInnerDims) { + *block_shape = TensorBlockShapeType::kSkewedInnerDims; + } + *block_total_size = + numext::maxi(*block_total_size, resources[i].block_total_size); + } +} + /** * \class TensorBlock * \ingroup CXX11_Tensor_Module @@ -74,7 +108,7 @@ enum class TensorBlockShapeType { * This class represents a tensor block specified by the index of the * first block coefficient, and the size of the block in each dimension. */ -template +template class TensorBlock { public: typedef DSizes Dimensions; @@ -614,6 +648,83 @@ struct TensorBlockCwiseBinaryIO { } }; +/** + * \class TensorBlockView + * \ingroup CXX11_Tensor_Module + * + * \brief Read-only view into a block of data. + * + * This class provides read-only access to a block of data in impl. It may need + * to allocate space for holding the intermediate result. + * + */ +template +struct TensorBlockView { + typedef TensorEvaluator Impl; + typedef typename Impl::Index Index; + typedef typename remove_const::type Scalar; + static const int NumDims = array_size::value; + typedef DSizes Dimensions; + + // Constructs a TensorBlockView for `impl`. `block` is only used for for + // specifying the start offset, shape, and strides of the block. + template + TensorBlockView(const Device& device, + const TensorEvaluator& impl, + const OtherTensorBlock& block) + : m_device(device), + m_block_sizes(block.block_sizes()), + m_data(NULL), + m_allocated_data(NULL) { + if (Impl::RawAccess && impl.data() != NULL) { + m_data = impl.data() + block.first_coeff_index(); + m_block_strides = block.tensor_strides(); + } else { + // Actually make a copy. + + // TODO(wuke): This sometimes put a lot pressure on the heap allocator. + // Consider allowing ops to request additional temporary block memory in + // TensorOpResourceRequirements. + m_allocated_data = static_cast( + m_device.allocate(m_block_sizes.TotalSize() * sizeof(Scalar))); + m_data = m_allocated_data; + if (NumDims > 0) { + if (static_cast(Impl::Layout) == static_cast(ColMajor)) { + m_block_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_block_strides[i] = m_block_strides[i - 1] * m_block_sizes[i - 1]; + } + } else { + m_block_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_block_strides[i] = m_block_strides[i + 1] * m_block_sizes[i + 1]; + } + } + } + TensorBlock input_block( + block.first_coeff_index(), m_block_sizes, m_block_strides, + block.tensor_strides(), m_allocated_data); + impl.block(&input_block); + } + } + + ~TensorBlockView() { + if (m_allocated_data != NULL) { + m_device.deallocate(m_allocated_data); + } + } + + const Dimensions& block_sizes() const { return m_block_sizes; } + const Dimensions& block_strides() const { return m_block_strides; } + const Scalar* data() const { return m_data; } + + private: + const Device& m_device; + Dimensions m_block_sizes, m_block_strides; + const Scalar* m_data; // Not owned. + Scalar* m_allocated_data; // Owned. 
+}; + /** * \class TensorBlockMapper * \ingroup CXX11_Tensor_Module diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 278689915..7ff0d323b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -1,4 +1,5 @@ // This file is part of Eigen, a lightweight C++ template library +// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2014 Benoit Steiner @@ -110,6 +111,7 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 21ffa2872..085c05f3d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -146,6 +146,7 @@ struct TensorEvaluator, Device> // slice offsets. IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -343,6 +344,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index a7c1380b8..9f0321880 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -122,6 +122,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; @@ -306,6 +307,7 @@ template::PacketAccess & TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 0e69cd40c..12cfa8df3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -240,6 +240,7 @@ struct TensorContractionEvaluatorBase enum { IsAligned = true, PacketAccess = (internal::unpacket_traits::size > 1), + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 182bef918..e0cbbb315 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -195,6 +195,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = true, + BlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 25131600d..1ec5819a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -307,6 +307,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, 
CoordAccess = false, // to be implemented RawAccess = false @@ -577,11 +578,11 @@ __global__ void EigenConvolutionKernel1D( const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { #if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) + HIP_DYNAMIC_SHARED(float, s) #else extern __shared__ float s[]; #endif - + const int first_x = blockIdx.x * maxX; const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); @@ -630,7 +631,7 @@ __global__ void EigenConvolutionKernel2D( const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { #if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) + HIP_DYNAMIC_SHARED(float, s) #else extern __shared__ float s[]; #endif @@ -702,7 +703,7 @@ __global__ void EigenConvolutionKernel3D( const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { #if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) + HIP_DYNAMIC_SHARED(float, s) #else extern __shared__ float s[]; #endif @@ -778,6 +779,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 65403905a..d301d0c01 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -242,6 +242,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 192d4aa7b..5ca47cca7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -290,6 +290,22 @@ struct DSizes : array { } } +#ifndef EIGEN_EMULATE_CXX11_META_H + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#else + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#endif + #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... 
otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index d0c027890..af39daa91 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -107,6 +107,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index fe62ff1ea..ba02802d2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -41,11 +41,24 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, PacketAccess = (internal::unpacket_traits::size > 1), + BlockAccess = internal::is_arithmetic::type>::value, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true }; + typedef typename internal::TensorBlock< + typename internal::remove_const::type, Index, NumCoords, Layout> + TensorBlock; + typedef typename internal::TensorBlockReader< + typename internal::remove_const::type, Index, NumCoords, Layout, + PacketAccess> + TensorBlockReader; + typedef typename internal::TensorBlockWriter< + typename internal::remove_const::type, Index, NumCoords, Layout, + PacketAccess> + TensorBlockWriter; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) : m_data(const_cast::template MakePointer::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) { } @@ -113,6 +126,20 @@ struct TensorEvaluator internal::unpacket_traits::size); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { + assert(m_data != NULL); + TensorBlockReader::Run(block, m_data); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + assert(m_data != NULL); + TensorBlockWriter::Run(block, m_data); + } + EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } /// required by sycl in order to construct sycl buffer from raw pointer @@ -167,11 +194,20 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, PacketAccess = (internal::unpacket_traits::size > 1), + BlockAccess = internal::is_arithmetic::type>::value, Layout = Derived::Layout, CoordAccess = NumCoords > 0, RawAccess = true }; + typedef typename internal::TensorBlock< + typename internal::remove_const::type, Index, NumCoords, Layout> + TensorBlock; + typedef typename internal::TensorBlockReader< + typename internal::remove_const::type, Index, NumCoords, Layout, + PacketAccess> + TensorBlockReader; + // Used for accessor extraction in SYCL Managed TensorMap: const Derived& derived() const { return m_impl; } @@ -219,6 +255,14 @@ struct TensorEvaluator internal::unpacket_traits::size); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const { + assert(m_data != NULL); + TensorBlockReader::Run(block, m_data); + } + EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return 
m_data; } /// added for sycl in order to construct the buffer from the sycl device @@ -244,6 +288,7 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = internal::functor_traits::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -308,7 +353,9 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + PacketAccess = TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -375,16 +422,21 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess & + TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + : m_device(device), + m_functor(op.functor()), m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { @@ -399,6 +451,14 @@ struct TensorEvaluator::size; typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size< + typename TensorEvaluator::Dimensions>::value; + + typedef internal::TensorBlock< + typename internal::remove_const::type, Index, NumDims, + TensorEvaluator::Layout> + TensorBlock; + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { // TODO: use right impl instead if right impl dimensions are known at compile time. 
@@ -433,6 +493,30 @@ struct TensorEvaluator* resources) const { + m_leftImpl.getResourceRequirements(resources); + m_rightImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) { + output_block->data()[0] = coeff(0); + return; + } + internal::TensorBlockView left_block( + m_device, m_leftImpl, *output_block); + internal::TensorBlockView right_block( + m_device, m_rightImpl, *output_block); + internal::TensorBlockCwiseBinaryIO< + BinaryOp, Index, typename internal::remove_const::type, NumDims, + Layout>::Run(m_functor, output_block->block_sizes(), + output_block->block_strides(), output_block->data(), + left_block.block_strides(), left_block.data(), + right_block.block_strides(), right_block.data()); + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } /// required by sycl in order to extract the accessor const TensorEvaluator& left_impl() const { return m_leftImpl; } @@ -442,6 +526,7 @@ struct TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; @@ -458,6 +543,7 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -562,6 +648,7 @@ struct TensorEvaluator IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::packet_traits::HasBlend, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 53640c6aa..024de3696 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -12,29 +12,37 @@ namespace Eigen { -/** \class TensorExecutor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor executor class. - * - * This class is responsible for launch the evaluation of the expression on - * the specified computing device. - */ +/** + * \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + * + * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and + * instructions) + * @tparam Tileable can use block based tensor evaluation + * (see TensorBlock.h) + */ namespace internal { -// Default strategy: the expression is evaluated with a single cpu thread. -template -class TensorExecutor -{ +/** + * Default strategy: the expression is evaluated sequentially with a single cpu + * thread, without vectorization and block evaluation. 
+ */ +template +class TensorExecutor { public: typedef typename Expression::Index Index; EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const Device& device = Device()) - { + static inline void run(const Expression& expr, + const Device& device = Device()) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { + if (needs_assign) { const Index size = array_prod(evaluator.dimensions()); for (Index i = 0; i < size; ++i) { evaluator.evalScalar(i); @@ -44,12 +52,14 @@ class TensorExecutor } }; - -template -class TensorExecutor -{ +/** + * Process all the data with a single cpu thread, using vectorized instructions. + */ +template +class TensorExecutor { public: typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { @@ -58,9 +68,11 @@ class TensorExecutor if (needs_assign) { const Index size = array_prod(evaluator.dimensions()); - const int PacketSize = unpacket_traits::PacketReturnType>::size; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + const int PacketSize = unpacket_traits::PacketReturnType>::size; + + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { @@ -80,9 +92,75 @@ class TensorExecutor } }; +/** + * Process all the data with a single cpu thread, using blocks of data. By + * sizing a block to fit L1 cache we get better cache performance. + */ +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + using Evaluator = TensorEvaluator; + using Index = typename traits::Index; + const int NumDims = traits::NumDimensions; -// Multicore strategy: the index space is partitioned and each partition is executed on a single core + using Scalar = typename traits::Scalar; + using ScalarNoConst = typename remove_const::type; + + using TensorBlock = + TensorBlock; + using TensorBlockMapper = + TensorBlockMapper; + + Evaluator evaluator(expr, device); + std::size_t total_size = array_prod(evaluator.dimensions()); + std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + + if (total_size < cache_size) { + // TODO(andydavis) Reduce block management overhead for small tensors. + // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp. + internal::TensorExecutor::run(expr, device); + return; + } + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + // Size tensor blocks to fit in cache (or requested target block size). + size_t block_total_size = numext::mini(cache_size, total_size); + TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; + // Query expression tree for desired block size/shape. 
+ std::vector resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + + TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape, + block_total_size); + block_total_size = block_mapper.block_dims_total_size(); + + Scalar* data = static_cast( + device.allocate(block_total_size * sizeof(Scalar))); + + const Index total_block_count = block_mapper.total_block_count(); + for (Index i = 0; i < total_block_count; ++i) { + TensorBlock block = block_mapper.GetBlockForIndex(i, data); + evaluator.evalBlock(&block); + } + device.deallocate(data); + } + evaluator.cleanup(); + } +}; + +/** + * Multicore strategy: the index space is partitioned and each partition is + * executed on a single core. + */ #ifdef EIGEN_USE_THREADS template struct EvalRange { @@ -100,7 +178,7 @@ struct EvalRange { }; template -struct EvalRange { +struct EvalRange { static const int PacketSize = unpacket_traits::size; static void run(Evaluator* evaluator_in, const Index first, const Index last) { @@ -110,8 +188,8 @@ struct EvalRange { if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); Index last_chunk_offset = last - 4 * PacketSize; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not // unroll the loop at the expense of inlining. for (; i <= last_chunk_offset; i += 4*PacketSize) { for (Index j = 0; j < 4; j++) { @@ -138,55 +216,113 @@ struct EvalRange { } }; -template -class TensorExecutor { +template +class TensorExecutor { public: typedef typename Expression::Index Index; - static inline void run(const Expression& expr, const ThreadPoolDevice& device) - { + + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { typedef TensorEvaluator Evaluator; + typedef EvalRange EvalRange; + Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const Index PacketSize = + Vectorizable + ? unpacket_traits::size + : 1; const Index size = array_prod(evaluator.dimensions()); - size_t num_threads = device.numThreads(); - if (num_threads > 1) { - num_threads = TensorCostModel::numThreads( - size, evaluator.costPerCoeff(Vectorizable), num_threads); - } - if (num_threads == 1) { - EvalRange::run(&evaluator, 0, size); - } else { - const Index PacketSize = Vectorizable ? 
unpacket_traits::size : 1; - Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; - const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Barrier barrier(numblocks); - for (int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier( - &barrier, &EvalRange::run, - &evaluator, i * blocksize, (i + 1) * blocksize); - } - if (numblocks * blocksize < size) { - EvalRange::run( - &evaluator, numblocks * blocksize, size); - } - barrier.Wait(); - } + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [&evaluator](Index first, Index last) { + EvalRange::run(&evaluator, first, last); + }); + } + evaluator.cleanup(); + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + + static inline void run(const Expression& expr, + const ThreadPoolDevice& device) { + typedef TensorEvaluator Evaluator; + typedef typename internal::remove_const< + typename traits::Scalar>::type Scalar; + typedef typename traits::Index Index; + + static const int NumDims = traits::NumDimensions; + + typedef TensorBlock TensorBlock; + typedef TensorBlockMapper + TensorBlockMapper; + + Evaluator evaluator(expr, device); + std::size_t total_size = array_prod(evaluator.dimensions()); + std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + if (total_size < cache_size) { + // TODO(andydavis) Reduce block management overhead for small tensors. + internal::TensorExecutor::run(expr, device); + evaluator.cleanup(); + return; + } + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims; + size_t block_total_size = 0; + // Query expression tree for desired block size/shape. + std::vector resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + int num_threads = device.numThreads(); + + // Estimate minimum block size based on cost. + TensorOpCost cost = evaluator.costPerCoeff(Vectorizable); + double taskSize = TensorCostModel::taskSize(1, cost); + size_t block_size = static_cast(1.0 / taskSize); + TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape, + block_size); + block_size = block_mapper.block_dims_total_size(); + const size_t aligned_blocksize = + EIGEN_MAX_ALIGN_BYTES * + divup(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES); + void* buf = device.allocate((num_threads + 1) * aligned_blocksize); + device.parallelFor( + block_mapper.total_block_count(), cost * block_size, + [=, &device, &evaluator, &block_mapper](Index first, Index last) { + // currentThreadId() returns -1 if called from a thread not in the + // threadpool, such as the main thread dispatching Eigen + // expressions. + const int thread_idx = device.currentThreadId(); + eigen_assert(thread_idx >= -1 && thread_idx < num_threads); + Scalar* thread_buf = reinterpret_cast( + static_cast(buf) + aligned_blocksize * (thread_idx + 1)); + for (Index i = first; i < last; ++i) { + auto block = block_mapper.GetBlockForIndex(i, thread_buf); + evaluator.evalBlock(&block); + } + }); + device.deallocate(buf); } evaluator.cleanup(); } }; + #endif // EIGEN_USE_THREADS // GPU: the evaluation of the expression is offloaded to a GPU. 
#if defined(EIGEN_USE_GPU) -template -class TensorExecutor { +template +class TensorExecutor { public: typedef typename Expression::Index Index; static void run(const Expression& expr, const GpuDevice& device); @@ -236,8 +372,8 @@ EigenMetaKernel(Evaluator eval, Index size) { } /*static*/ -template -inline void TensorExecutor::run( +template +inline void TensorExecutor::run( const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index e943757ad..1342e47a6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -40,6 +40,8 @@ class TensorFixedSize : public TensorBase0), + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index b8f0bc798..fdb31928f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -98,6 +98,7 @@ struct TensorEvaluator, Device> enum { IsAligned = true, PacketAccess = (PacketSize > 1), + BlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = true }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 19e456e19..8ed1796df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -129,8 +129,14 @@ struct IsVectorizable { TensorEvaluator::IsAligned; }; +template +struct IsTileable { + static const bool value = TensorEvaluator::BlockAccess; +}; + template ::value> + bool Vectorizable = IsVectorizable::value, + bool Tileable = IsTileable::value> class TensorExecutor; } // end namespace internal diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index f0f7c7826..72cb2d15f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -186,6 +186,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 4e384f9b9..e3165fa10 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -119,6 +119,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -181,6 +182,7 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? 
RowMajor : ColMajor, CoordAccess = false // to be implemented }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index cda49f8fe..498488649 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -105,6 +105,7 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -170,6 +171,7 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess @@ -325,6 +327,7 @@ struct TensorEvaluator, Devi // slice offsets and sizes. IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false @@ -557,6 +560,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess @@ -716,7 +720,6 @@ struct TensorEvaluator::value; typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type ScalarNonConst; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Strides Dimensions; @@ -858,7 +861,7 @@ struct TensorEvaluator::type ScalarNonConst; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Strides Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 5956e513d..ffa22f31e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -96,6 +96,7 @@ struct TensorEvaluator, Device enum { IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = true, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 9e0a20abf..950ac32af 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -94,6 +94,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index ce573d730..375fc0802 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -412,6 +412,7 @@ struct TensorEvaluator, enum { IsAligned = false, PacketAccess = Self::InputPacketAccess && Op::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h 
index b2b4fd8d3..a6cade50f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -136,6 +136,7 @@ template class TensorRef : public TensorBase, Device> enum { IsAligned = false, PacketAccess = false, + BlockAccess = false, Layout = TensorRef::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -411,6 +413,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator, Device enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -253,6 +254,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 0697fd1ce..6b54f40ad 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -112,6 +112,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -240,6 +241,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index a7eea99b6..c09513c10 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -112,6 +112,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false @@ -273,6 +274,7 @@ struct TensorEvaluator, Device> enum { IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 2b1968de1..c8b2fad1e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -95,6 +95,7 @@ struct TensorEvaluator, Device> enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false @@ -110,7 +111,7 @@ struct TensorEvaluator, Device> for (int i = 0; i < NumInputDims; ++i) { m_reduced[i] = false; } - + const Dims& op_dims = op.dims(); for (int i = 0; i < NumReducedDims; ++i) { eigen_assert(op_dims[i] >= 0); @@ -128,7 +129,7 @@ struct TensorEvaluator, Device> eigen_assert(num_distinct_reduce_dims == NumReducedDims); - // Compute the dimensions of the result. + // Compute the dimensions of the result. 
const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); int output_index = 0; @@ -229,7 +230,7 @@ struct TensorEvaluator, Device> result += m_impl.coeff(cur_index); cur_index += index_stride; } - + return result; } diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index fa19b2159..239a80926 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -213,6 +213,7 @@ if(EIGEN_TEST_CXX11) ei_add_test(cxx11_tensor_striding) ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}") + ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}") ei_add_test(cxx11_tensor_ref) ei_add_test(cxx11_tensor_random) ei_add_test(cxx11_tensor_generator) diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp index 15f2392a3..416b686e4 100644 --- a/unsupported/test/cxx11_tensor_block_access.cpp +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -901,7 +901,7 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape) CALL_SUBTEST(NAME(ARG)); \ CALL_SUBTEST(NAME(ARG)) -EIGEN_DECLARE_TEST(cxx11_tensor_assign) { +EIGEN_DECLARE_TEST(cxx11_tensor_block_access) { CALL_SUBTEST_LAYOUTS(test_block_mapper_sanity); CALL_SUBTEST_LAYOUTS(test_block_mapper_maps_every_element); CALL_SUBTEST_LAYOUTS(test_slice_block_mapper_maps_every_element); diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu index aa28457b1..f2a2a6cfa 100644 --- a/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu @@ -93,7 +93,7 @@ void test_cuda_complex_cwise_ops() { } -void test_cxx11_tensor_complex_cwise_ops() +EIGEN_DECLARE_TEST(test_cxx11_tensor_complex_cwise_ops) { CALL_SUBTEST(test_cuda_complex_cwise_ops()); CALL_SUBTEST(test_cuda_complex_cwise_ops()); diff --git a/unsupported/test/cxx11_tensor_complex_gpu.cu b/unsupported/test/cxx11_tensor_complex_gpu.cu index 7cf06aa7a..f8b8ae704 100644 --- a/unsupported/test/cxx11_tensor_complex_gpu.cu +++ b/unsupported/test/cxx11_tensor_complex_gpu.cu @@ -177,7 +177,7 @@ static void test_cuda_product_reductions() { } -void test_cxx11_tensor_complex() +EIGEN_DECLARE_TEST(test_cxx11_tensor_complex) { CALL_SUBTEST(test_cuda_nullary()); CALL_SUBTEST(test_cuda_sum_reductions()); diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp new file mode 100644 index 000000000..5ae45ac5b --- /dev/null +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -0,0 +1,81 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Eugene Zhulenev +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_USE_THREADS + +#include "main.h" + +#include + +using Eigen::Index; +using Eigen::Tensor; +using Eigen::RowMajor; +using Eigen::ColMajor; + +// A set of tests to verify that different TensorExecutor strategies yields the +// same results for all the ops, supporting tiled execution. + +template +static void test_execute_binary_expr(Device d) { + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. 
+ Tensor lhs(840, 390, 37); + Tensor rhs(840, 390, 37); + Tensor dst(840, 390, 37); + + lhs.setRandom(); + rhs.setRandom(); + + const auto expr = lhs + rhs; + + using Assign = TensorAssignOp; + using Executor = + internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); + + for (int i = 0; i < 840; ++i) { + for (int j = 0; j < 390; ++j) { + for (int k = 0; k < 37; ++k) { + float sum = lhs(i, j, k) + rhs(i, j, k); + VERIFY_IS_EQUAL(sum, dst(i, j, k)); + } + } + } +} + +#define CALL_SUBTEST_COMBINATIONS(NAME) \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(default_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))); \ + CALL_SUBTEST((NAME(tp_device))) + +EIGEN_DECLARE_TEST(cxx11_tensor_executor) { + Eigen::DefaultDevice default_device; + + const auto num_threads = internal::random(1, 24); + Eigen::ThreadPool tp(num_threads); + Eigen::ThreadPoolDevice tp_device(&tp, num_threads); + + CALL_SUBTEST_COMBINATIONS(test_execute_binary_expr); +} + +#undef CALL_SUBTEST_COMBINATIONS -- cgit v1.2.3
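
Editor's note (not part of the patch): below is a minimal usage sketch of the tiled execution path this patch introduces. It assumes the patch is applied; the file name, include path, tensor sizes and thread count are illustrative only. Because both operands of the cwise expression report BlockAccess, the IsTileable trait selects the block-based TensorExecutor specialization, which partitions the output into cache-sized TensorBlocks and evaluates them via evalBlock().

    // tiled_executor_example.cpp -- illustrative sketch, not shipped with Eigen.
    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Large enough to exceed the small-tensor fallback in the tiled executor.
      Eigen::Tensor<float, 3> lhs(840, 390, 37);
      Eigen::Tensor<float, 3> rhs(840, 390, 37);
      Eigen::Tensor<float, 3> dst(840, 390, 37);
      lhs.setRandom();
      rhs.setRandom();

      Eigen::ThreadPool pool(4);
      Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

      // Both evaluators expose BlockAccess, so the Tileable=true
      // TensorExecutor specialization added by this patch runs: the
      // expression is split into TensorBlocks sized via the block mapper
      // and each block is evaluated with evalBlock() into a per-thread
      // scratch buffer before being written to dst.
      dst.device(device) = lhs + rhs;
      return 0;
    }

The same assignment on Eigen::DefaultDevice goes through the single-threaded Tileable specialization instead, falling back to the plain vectorized executor when the total size is below the first-level-cache heuristic.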