From d380c23b2cc0b02e10819e779c73cde2c62603b2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 14 Oct 2019 14:31:59 -0700 Subject: Block evaluation for TensorGenerator/TensorReverse/TensorShuffling --- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h | 112 ++++++---- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 15 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 13 -- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 33 ++- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 7 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 147 +++++++++++- unsupported/test/cxx11_tensor_block_eval.cpp | 246 ++++++++++++++++----- unsupported/test/cxx11_tensor_executor.cpp | 131 ++++++----- 16 files changed, 546 insertions(+), 188 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 29aa7a97e..f2b9389c8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -242,9 +242,8 @@ struct TensorEvaluator, Device> (internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar))); } - RightTensorBlock block = m_rightImpl.blockV2(desc, scratch); - // If block was evaluated into a destination, there is no need to do - // assignment. + RightTensorBlock block = m_rightImpl.blockV2(desc, scratch, /*root_of_expr_ast=*/true); + // If block was evaluated into a destination, there is no need to do assignment. if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { m_leftImpl.writeBlockV2(desc, block); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h index b8c592543..099d7cd57 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h @@ -45,6 +45,12 @@ EIGEN_ALWAYS_INLINE DSizes strides( return strides; } +template +EIGEN_ALWAYS_INLINE DSizes strides( + const Eigen::array& dimensions) { + return strides(DSizes(dimensions)); +} + #if EIGEN_HAS_CXX11 template EIGEN_STRONG_INLINE DSizes strides( @@ -78,23 +84,24 @@ class TensorBlockDescriptor { return static_cast(m_data); } - private: - friend class TensorBlockDescriptor; - - DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {} + template + Dimensions dimensions() const { + Dimensions dimensions; + for (int i = 0; i < NumDims; ++i) { + eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0); + dimensions[i] = m_dimensions[i] / sizeof(Scalar); + } + return dimensions; + } template - DestinationBuffer(Scalar* data, const Dimensions& dimensions, - const Dimensions& strides, size_t total_dst_bytes) - : m_data(static_cast(data)), - m_dimensions(dimensions), - m_strides(strides), - m_total_dst_bytes(total_dst_bytes) { - // TODO(ezhulenev): Benchmark template meta-unroll for this loop. 
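+  // Like dimensions() above, strides() converts the byte-based strides stored
+  // internally (the private constructor below scales both dimensions and
+  // strides by sizeof(Scalar)) back into element counts for the requested
+  // Scalar type.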
+ Dimensions strides() const { + Dimensions strides; for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] *= sizeof(Scalar); - m_strides[i] *= sizeof(Scalar); + eigen_assert(m_strides[i] % sizeof(Scalar) == 0); + strides[i] = m_strides[i] / sizeof(Scalar); } + return strides; } // Returns true if the tensor block corresponding to `desc` fits into the @@ -109,29 +116,34 @@ class TensorBlockDescriptor { if (!dimensions_match(desc_dims, dst_dims)) return false; const Dimensions& desc_strides = internal::strides(desc_dims); - const Dimensions& dst_strides = internal::strides(dst_dims); + const Dimensions& dst_strides = strides(); - return dimensions_match(desc_strides, dst_strides); - } - - template - Dimensions dimensions() const { - Dimensions dimensions; + // Compare strides ignoring dimensions of size `1`. for (int i = 0; i < NumDims; ++i) { - eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0); - dimensions[i] = m_dimensions[i] / sizeof(Scalar); + if (desc_dims[i] == 1) continue; + if (desc_strides[i] != dst_strides[i]) return false; } - return dimensions; + + return true; } + private: + friend class TensorBlockDescriptor; + + DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {} + template - Dimensions strides() const { - Dimensions strides; + DestinationBuffer(Scalar* data, const Dimensions& dimensions, + const Dimensions& strides, size_t total_dst_bytes) + : m_data(static_cast(data)), + m_dimensions(dimensions), + m_strides(strides), + m_total_dst_bytes(total_dst_bytes) { + // TODO(ezhulenev): Benchmark template meta-unroll for this loop. for (int i = 0; i < NumDims; ++i) { - eigen_assert(m_strides[i] % sizeof(Scalar) == 0); - strides[i] = m_strides[i] / sizeof(Scalar); + m_dimensions[i] *= sizeof(Scalar); + m_strides[i] *= sizeof(Scalar); } - return strides; } void* m_data; @@ -181,6 +193,12 @@ class TensorBlockDescriptor { return *this; } + bool HasDestinationBuffer() const { return m_destination.m_data != NULL; } + + const DestinationBuffer& GetDestinationBuffer() const { + return m_destination; + } + // Returns a non-nullptr pointer to a destination buffer memory if this // block has a contiguous destination buffer. template @@ -191,6 +209,11 @@ class TensorBlockDescriptor { return NULL; } + // Returns a copy of `*this` with updated offset. + TensorBlockDescriptor WithOffset(IndexType offset) const { + return TensorBlockDescriptor(offset, m_dimensions, m_destination); + } + private: // Offset and dimensions are immutable after construction. Block descriptor // can only be mutated by adding or dropping destination. @@ -294,18 +317,12 @@ enum TensorBlockKind { // Tensor block that was materialized directly into the final output memory // buffer. For example if the left side of an assignment is a Tensor, we can - // directly materialize the block in the destination memory. The block - // expression is still a valid Tensor expression, and can be used to build - // lazy expressions. + // directly materialize the block in the destination memory. + // + // If strides in the output buffer do not match tensor block strides, the + // Tensor expression will be invalid, and should not be used by + // TensorBlockAssign or for constructing another block expression. kMaterializedInOutput - - // TODO(ezhulenev): If we know that we are evaluating a block, for the root of - // the expression tree, it might be beneficial to do an assignment to the - // output memory buffer, even if it will be impossible to construct a valid - // block expression after that (e.g. 
output memory buffer has strides not - // compatible with TensorMap). This might be a performance optimization for - // uniformly shaped blocks, because for blocks skewed towards inner dimension - // `kMaterializedInOutput` should always work. }; #if !EIGEN_HAS_CXX11 } // namespace TensorBlockKind @@ -346,6 +363,11 @@ struct XprScalar { // Tensor), or a memory buffer allocated with scratch allocator, and in this // case the scratch allocator will deallocate it at the end of block based // expression execution. +// +// If the block was evaluated directly into the output buffer, and strides in +// the output buffer do not match block strides, the TensorMap expression will +// be invalid, and should never be used in block assignment or any other tensor +// expression. template @@ -358,11 +380,12 @@ class TensorMaterializedBlock { typedef TensorMap > XprType; TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, - const Dimensions& dimensions) + const Dimensions& dimensions, bool valid_expr = true) : m_kind(kind), m_data(data), m_dimensions(dimensions), - m_expr(m_data, m_dimensions) { + m_expr(m_data, m_dimensions), + m_valid_expr(valid_expr) { eigen_assert(m_kind == internal::TensorBlockKind::kView || m_kind == internal::TensorBlockKind::kMaterializedInScratch || m_kind == internal::TensorBlockKind::kMaterializedInOutput); @@ -372,7 +395,10 @@ class TensorMaterializedBlock { // NOTE(ezhulenev): Returning XprType by value like in other block types // causes asan failures. The theory is that XprType::Nested doesn't work // properly for TensorMap. - const XprType& expr() const { return m_expr; } + const XprType& expr() const { + eigen_assert(m_valid_expr); + return m_expr; + } const Scalar* data() const { return m_data; } void cleanup() {} @@ -427,6 +453,7 @@ class TensorMaterializedBlock { bool materialized_in_output; if (block_buffer != NULL) { + desc.DropDestinationBuffer(); materialized_in_output = true; } else { @@ -461,6 +488,7 @@ class TensorMaterializedBlock { const Scalar* m_data; Dimensions m_dimensions; XprType m_expr; + bool m_valid_expr; }; // -------------------------------------------------------------------------- // diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index dc9551d32..cc0a00e8d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -882,7 +882,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 20591da33..7eaf1f09e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -368,7 +368,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { const Index chip_dim = m_dim.actualDim(); DSizes input_block_dims; @@ -390,6 +391,7 @@ struct TensorEvaluator, Device> } ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, 
scratch); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); if (arg_block.data() != NULL) { // Forward argument block buffer if possible. @@ -405,6 +407,7 @@ struct TensorEvaluator, Device> bool materialized_in_output; if (output_buffer != NULL) { + desc.DropDestinationBuffer(); materialized_in_output = true; } else { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index cc3e67677..2a6d67ad5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -404,7 +404,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { return TensorBlockV2(m_impl.blockV2(desc, scratch), TensorConversionOpBlockFactory()); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index d7bebd30b..132458a20 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -481,7 +481,7 @@ struct sizes_match_below_dim { template -EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) { return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index b77d8fe84..4c2767d44 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -166,7 +166,8 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { assert(m_data != NULL); return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); } @@ -353,7 +354,8 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { assert(m_data != NULL); return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); } @@ -571,7 +573,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { return TensorBlockV2(m_argImpl.blockV2(desc, scratch), m_functor); } @@ -729,7 +732,8 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { // It's unsafe to pass destination buffer to underlying expressions, because // output might be aliased with one of the inputs. 
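    // For example, when the same tensor appears on both sides of an assignment,
    // a block materialized directly into the destination could overwrite
    // coefficients of an input before they are read.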
desc.DropDestinationBuffer(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 6ad6327a6..97ac96db1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -521,19 +521,6 @@ class TensorExecutor::value) { - internal::TensorExecutor::run(expr, device); - evaluator.cleanup(); - return; - } const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index d98af1355..7d12e781e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -176,7 +176,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { assert(m_buffer != NULL); return TensorBlockV2::materialize(m_buffer, m_impl.dimensions(), desc, scratch); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 38d0bf7d3..c69e2df92 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -238,7 +238,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); @@ -253,6 +254,7 @@ struct TensorEvaluator, Device> bool materialized_in_output; if (block_buffer != NULL) { + desc.DropDestinationBuffer(); materialized_in_output = true; } else { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index c9d78ba9b..ab3a979a8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -365,7 +365,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { eigen_assert(m_impl.data() != NULL); eigen_assert((kind == Runtime) || (kind == OneByN && desc.dimensions()[0] == 1) || @@ -611,7 +612,7 @@ struct TensorEvaluator, Devi IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -624,7 +625,12 @@ struct TensorEvaluator, Devi typedef typename TensorBlock::Dimensions TensorBlockDimensions; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + // Tensor slicing does not change the block type. 
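+  // The slicing evaluator simply forwards the block request to the argument
+  // evaluator with the offset shifted to the start of the slice (see blockV2
+  // below), so the argument's block type can be reused as-is.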
+ typedef typename TensorEvaluator::TensorBlockV2 + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -804,6 +810,15 @@ struct TensorEvaluator, Devi m_impl.block(&input_block); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); + TensorBlockV2 block = m_impl.blockV2(arg_desc, scratch); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + return block; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { typename Storage::Type result = constCast(m_impl.data()); if (result) { @@ -900,7 +915,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -913,7 +928,8 @@ struct TensorEvaluator, Device> typedef typename TensorBlock::Dimensions TensorBlockDimensions; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -987,6 +1003,13 @@ struct TensorEvaluator, Device> block.block_strides(), TensorBlockDimensions(this->m_inputStrides), const_cast(block.data()))); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + const TensorBlockDesc& desc, const TensorBlockV2& block) { + TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); + this->m_impl.writeBlockV2(arg_desc, block); + } }; namespace internal { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index a0b4e04b1..99c74fc67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -230,7 +230,8 @@ struct TensorEvaluator, Device } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { // If one of the dimensions is zero, return empty block view. if (desc.size() == 0) { return TensorBlockV2(internal::TensorBlockKind::kView, NULL, @@ -240,8 +241,8 @@ struct TensorEvaluator, Device // Check if we can reuse `desc` destination, or allocate new scratch buffer. 
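    // destination() returns a non-null pointer only if the descriptor carries a
    // destination buffer into which the block fits contiguously; otherwise we
    // fall back to a scratch allocation below.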
ScalarNoConst* materialized_output = desc.template destination(); - bool materialized_in_output; + if (materialized_output != NULL) { desc.DropDestinationBuffer(); materialized_in_output = true; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 6e7abeb09..a51c88540 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -355,7 +355,8 @@ struct TensorEvaluator, Device } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { // TODO(ezhulenev): If underlying tensor expression supports and prefers // block evaluation we must use it. Currently we use coeff and packet // access into the underlying tensor expression. @@ -370,10 +371,12 @@ struct TensorEvaluator, Device const bool inner_dim_reversed = m_reverse[inner_dim_idx]; // Try to reuse destination as an output block buffer. - CoeffReturnType* block_buffer = desc.template destination(); + CoeffReturnType* block_buffer = + desc.template destination(); bool materialized_in_output; if (block_buffer != NULL) { + desc.DropDestinationBuffer(); materialized_in_output = true; } else { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 5e8abad75..bb9908b62 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -116,7 +116,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -131,7 +131,12 @@ struct TensorEvaluator, Device> TensorBlockReader; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -143,6 +148,7 @@ struct TensorEvaluator, Device> const Shuffle& shuffle = op.shufflePermutation(); m_is_identity = true; for (int i = 0; i < NumDims; ++i) { + m_shuffle[i] = static_cast(shuffle[i]); m_dimensions[i] = input_dims[shuffle[i]]; m_inverseShuffle[shuffle[i]] = i; if (m_is_identity && shuffle[i] != i) { @@ -241,7 +247,6 @@ struct TensorEvaluator, Device> 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::kUniformAllDims, block_total_size_max)); - m_impl.getResourceRequirements(resources); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( @@ -336,6 +341,78 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + assert(m_impl.data() != NULL); + + typedef internal::TensorBlockIOV2 + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src 
TensorBlockIOSrc; + + ScalarNoConst* block_buffer = NULL; + typename TensorBlockIO::Dimensions block_strides; + + bool materialized_in_output = false; + bool has_valid_materialized_expr = true; + + if (desc.HasDestinationBuffer()) { + // Check if we can reuse destination buffer for block materialization. + const typename TensorBlockDesc::DestinationBuffer& destination_buffer = + desc.GetDestinationBuffer(); + + const bool dims_match = dimensions_match( + desc.dimensions(), destination_buffer.template dimensions()); + + const bool strides_match = + dimensions_match(internal::strides(desc.dimensions()), + destination_buffer.template strides()); + + if (dims_match && strides_match) { + // Destination buffer fits the block contiguously. + materialized_in_output = true; + has_valid_materialized_expr = true; + block_buffer = destination_buffer.template data(); + block_strides = internal::strides(desc.dimensions()); + eigen_assert(block_buffer != NULL); + + } else if (dims_match && root_of_expr_ast) { + // Destination buffer has strides not matching the block strides, but + // for the root of the expression tree it's safe to materialize anyway. + materialized_in_output = true; + has_valid_materialized_expr = false; + block_buffer = destination_buffer.template data(); + block_strides = destination_buffer.template strides(); + eigen_assert(block_buffer != NULL); + } + + if (materialized_in_output) desc.DropDestinationBuffer(); + } + + // If we were not able to reuse destination buffer, allocate temporary + // buffer for block evaluation using scratch allocator. + if (!materialized_in_output) { + void* mem = scratch.allocate(desc.size() * sizeof(ScalarNoConst)); + block_buffer = static_cast(mem); + block_strides = internal::strides(desc.dimensions()); + } + + typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides); + TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset())); + + TensorBlockIODst dst(desc.dimensions(), block_strides, block_buffer); + + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle); + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + return TensorBlockV2( + materialized_in_output + ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + block_buffer, desc.dimensions(), has_valid_materialized_expr); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { const double compute_cost = m_is_identity ? TensorOpCost::AddCost() : NumDims * (2 * TensorOpCost::AddCost() + @@ -400,7 +477,8 @@ struct TensorEvaluator, Device> Dimensions m_dimensions; bool m_is_identity; - array m_inverseShuffle; + array m_shuffle; + array m_inverseShuffle; // TODO(ezhulenev): Make it int type. 
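+  // m_shuffle maps an output dimension to the input dimension it reads from
+  // (m_dimensions[i] == input_dims[m_shuffle[i]]), while m_inverseShuffle maps
+  // an input dimension back to its position in the output; e.g. for
+  // shuffle = {2, 0, 1} the inverse is {1, 2, 0}.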
array m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; @@ -431,7 +509,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, RawAccess = false @@ -445,7 +523,7 @@ struct TensorEvaluator, Device> TensorBlockWriter; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -477,6 +555,63 @@ struct TensorEvaluator, Device> this->m_inverseShuffle, this->m_unshuffledInputStrides, this->m_impl.data()); } + +template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + const TensorBlockDesc& desc, const TensorBlockV2& block) { + eigen_assert(this->m_impl.data() != NULL); + + typedef internal::TensorBlockIOV2 + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + const Scalar* block_buffer = block.data(); + + // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen + // expression with coefficient and packet access as `src`. + void* mem = NULL; + if (block_buffer == NULL) { + mem = this->m_device.allocate(desc.size() * sizeof(Scalar)); + ScalarNoConst* buf = static_cast(mem); + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumDims, typename TensorBlockV2::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + desc.dimensions(), internal::strides(desc.dimensions()), + buf), + block.expr()); + + block_buffer = buf; + } + + // Read from block. + TensorBlockIOSrc src(internal::strides(desc.dimensions()), + block_buffer); + + // Write to the output buffer. + typename TensorBlockIO::Dimensions output_strides( + this->m_unshuffledInputStrides); + typename TensorBlockIO::Dimensions output_dimensions; + for (int i = 0; i < NumDims; ++i) { + output_dimensions[this->m_shuffle[i]] = desc.dimension(i); + } + TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(), + this->srcCoeff(desc.offset())); + + // Reorder dimensions according to the shuffle. + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map; + for (int i = 0; i < NumDims; ++i) { + dst_to_src_dim_map[i] = static_cast(this->m_inverseShuffle[i]); + } + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + // Deallocate temporary buffer used for the block materialization. + if (mem != NULL) this->m_device.deallocate(mem); + } }; diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp index aac75014c..eb93c5c53 100644 --- a/unsupported/test/cxx11_tensor_block_eval.cpp +++ b/unsupported/test/cxx11_tensor_block_eval.cpp @@ -139,23 +139,50 @@ static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) { // Evaluate TensorBlock expression into a tensor. Tensor block(block_params.desc.dimensions()); + // Dimensions for the potential destination buffer. 
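+  // Half of the time the destination matches the block dimensions exactly;
+  // otherwise each dimension is padded by a random extent (possibly zero), so
+  // the destination strides generally do not match the block strides and the
+  // non-contiguous destination path gets exercised.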
+ DSizes dst_dims; + if (internal::random()) { + dst_dims = block_params.desc.dimensions(); + } else { + for (int i = 0; i < NumDims; ++i) { + Index extent = internal::random(0, 5); + dst_dims[i] = block_params.desc.dimension(i) + extent; + } + } + // Maybe use this tensor as a block desc destination. - Tensor dst(block_params.desc.dimensions()); + Tensor dst(dst_dims); + dst.setZero(); if (internal::random()) { block_params.desc.template AddDestinationBuffer( dst.data(), internal::strides(dst.dimensions()), dst.dimensions().TotalSize() * sizeof(T)); } - auto tensor_block = eval.blockV2(block_params.desc, scratch); - auto b_expr = tensor_block.expr(); + const bool root_of_expr = internal::random(); + auto tensor_block = eval.blockV2(block_params.desc, scratch, root_of_expr); + + if (tensor_block.kind() == internal::TensorBlockKind::kMaterializedInOutput) { + // Copy data from destination buffer. + if (dimensions_match(dst.dimensions(), block.dimensions())) { + block = dst; + } else { + DSizes offsets; + for (int i = 0; i < NumDims; ++i) offsets[i] = 0; + block = dst.slice(offsets, block.dimensions()); + } - // We explicitly disable vectorization and tiling, to run a simple coefficient - // wise assignment loop, because it's very simple and should be correct. - using BlockAssign = TensorAssignOp; - using BlockExecutor = TensorExecutor; - BlockExecutor::run(BlockAssign(block, b_expr), d); + } else { + // Assign to block from expression. + auto b_expr = tensor_block.expr(); + + // We explicitly disable vectorization and tiling, to run a simple coefficient + // wise assignment loop, because it's very simple and should be correct. + using BlockAssign = TensorAssignOp; + using BlockExecutor = TensorExecutor; + BlockExecutor::run(BlockAssign(block, b_expr), d); + } // Cleanup temporary buffers owned by a tensor block. tensor_block.cleanup(); @@ -375,17 +402,16 @@ static void test_eval_tensor_generator() { Tensor input(dims); input.setRandom(); - auto generator = [](const array& dims) -> T { + auto generator = [](const array& coords) -> T { T result = static_cast(0); for (int i = 0; i < NumDims; ++i) { - result += static_cast((i + 1) * dims[i]); + result += static_cast((i + 1) * coords[i]); } return result; }; VerifyBlockEvaluator( - input.generate(generator), - [&dims]() { return FixedSizeBlock(dims); }); + input.generate(generator), [&dims]() { return FixedSizeBlock(dims); }); VerifyBlockEvaluator( input.generate(generator), @@ -403,12 +429,63 @@ static void test_eval_tensor_reverse() { for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random(); VerifyBlockEvaluator( - input.reverse(reverse), - [&dims]() { return FixedSizeBlock(dims); }); + input.reverse(reverse), [&dims]() { return FixedSizeBlock(dims); }); + + VerifyBlockEvaluator(input.reverse(reverse), [&dims]() { + return RandomBlock(dims, 1, 10); + }); +} + +template +static void test_eval_tensor_slice() { + DSizes dims = RandomDims(10, 20); + Tensor input(dims); + input.setRandom(); + + // Pick a random slice of an input tensor. + DSizes slice_start = RandomDims(5, 10); + DSizes slice_size = RandomDims(5, 10); + + // Make sure that slice start + size do not overflow tensor dims. 
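+  // (slice_start is clamped to at most dims - 1 and slice_size to the extent
+  // remaining after the start, so the slice always stays inside the tensor.)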
+ for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + VerifyBlockEvaluator( + input.slice(slice_start, slice_size), + [&slice_size]() { return FixedSizeBlock(slice_size); }); VerifyBlockEvaluator( - input.reverse(reverse), - [&dims]() { return RandomBlock(dims, 1, 10); }); + input.slice(slice_start, slice_size), + [&slice_size]() { return RandomBlock(slice_size, 1, 10); }); +} + +template +static void test_eval_tensor_shuffle() { + DSizes dims = RandomDims(5, 15); + Tensor input(dims); + input.setRandom(); + + DSizes shuffle; + for (int i = 0; i < NumDims; ++i) shuffle[i] = i; + + do { + DSizes shuffled_dims; + for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]]; + + VerifyBlockEvaluator( + input.shuffle(shuffle), + [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); }); + + VerifyBlockEvaluator( + input.shuffle(shuffle), [&shuffled_dims]() { + return RandomBlock(shuffled_dims, 1, 5); + }); + + break; + + } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); } template @@ -564,7 +641,7 @@ static void test_assign_to_tensor_chipping() { Index chip_dim = internal::random(0, NumDims - 1); Index chip_offset = internal::random(0, dims[chip_dim] - 2); - DSizes < Index, NumDims - 1 > chipped_dims; + DSizes chipped_dims; for (Index i = 0; i < chip_dim; ++i) { chipped_dims[i] = dims[i]; } @@ -587,42 +664,111 @@ static void test_assign_to_tensor_chipping() { [&chipped_dims]() { return FixedSizeBlock(chipped_dims); }); } -// -------------------------------------------------------------------------- // +template +static void test_assign_to_tensor_slice() { + DSizes dims = RandomDims(10, 20); + Tensor tensor(dims); + + // Pick a random slice of tensor. + DSizes slice_start = RandomDims(5, 10); + DSizes slice_size = RandomDims(5, 10); + + // Make sure that slice start + size do not overflow tensor dims. 
+ for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + TensorMap> map(tensor.data(), dims); + + VerifyBlockAssignment( + tensor, map.slice(slice_start, slice_size), + [&slice_size]() { return RandomBlock(slice_size, 1, 10); }); + + VerifyBlockAssignment( + tensor, map.slice(slice_start, slice_size), + [&slice_size]() { return SkewedInnerBlock(slice_size); }); -#define CALL_SUBTESTS_DIMS_LAYOUTS(NAME) \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())) + VerifyBlockAssignment( + tensor, map.slice(slice_start, slice_size), + [&slice_size]() { return FixedSizeBlock(slice_size); }); +} + +template +static void test_assign_to_tensor_shuffle() { + DSizes dims = RandomDims(5, 15); + Tensor tensor(dims); + + DSizes shuffle; + for (int i = 0; i < NumDims; ++i) shuffle[i] = i; + + TensorMap> map(tensor.data(), dims); -#define CALL_SUBTESTS_LAYOUTS(NAME) \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())) + do { + DSizes shuffled_dims; + for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]]; + + VerifyBlockAssignment( + tensor, map.shuffle(shuffle), + [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); }); + + VerifyBlockAssignment( + tensor, map.shuffle(shuffle), [&shuffled_dims]() { + return RandomBlock(shuffled_dims, 1, 5); + }); + + } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); +} + +// -------------------------------------------------------------------------- // + +#define CALL_SUBTEST_PART(PART) \ + CALL_SUBTEST_##PART + +#define CALL_SUBTESTS_DIMS_LAYOUTS(PART, NAME) \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())) + +#define CALL_SUBTESTS_LAYOUTS(PART, NAME) \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())) EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) { // clang-format off - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_block); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_unary_expr_block); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_binary_expr_block); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_binary_with_unary_expr_block); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_broadcast); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_reshape); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_cast); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_select); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_padding); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_chipping); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_generator); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_reverse); - - CALL_SUBTESTS_LAYOUTS(test_eval_tensor_reshape_with_bcast); - CALL_SUBTESTS_LAYOUTS(test_eval_tensor_forced_eval); - - CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor); - CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_reshape); - CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_chipping); + CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_block); + CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_unary_expr_block); + 
CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_binary_expr_block); + CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_binary_with_unary_expr_block); + CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_broadcast); + CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_reshape); + CALL_SUBTESTS_DIMS_LAYOUTS(3, test_eval_tensor_cast); + CALL_SUBTESTS_DIMS_LAYOUTS(3, test_eval_tensor_select); + CALL_SUBTESTS_DIMS_LAYOUTS(3, test_eval_tensor_padding); + CALL_SUBTESTS_DIMS_LAYOUTS(4, test_eval_tensor_chipping); + CALL_SUBTESTS_DIMS_LAYOUTS(4, test_eval_tensor_generator); + CALL_SUBTESTS_DIMS_LAYOUTS(4, test_eval_tensor_reverse); + CALL_SUBTESTS_DIMS_LAYOUTS(5, test_eval_tensor_slice); + CALL_SUBTESTS_DIMS_LAYOUTS(5, test_eval_tensor_shuffle); + + CALL_SUBTESTS_LAYOUTS(6, test_eval_tensor_reshape_with_bcast); + CALL_SUBTESTS_LAYOUTS(6, test_eval_tensor_forced_eval); + + CALL_SUBTESTS_DIMS_LAYOUTS(7, test_assign_to_tensor); + CALL_SUBTESTS_DIMS_LAYOUTS(7, test_assign_to_tensor_reshape); + CALL_SUBTESTS_DIMS_LAYOUTS(7, test_assign_to_tensor_chipping); + CALL_SUBTESTS_DIMS_LAYOUTS(8, test_assign_to_tensor_slice); + CALL_SUBTESTS_DIMS_LAYOUTS(8, test_assign_to_tensor_shuffle); + + // Force CMake to split this test. + // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8 + // clang-format on } diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 66f932746..dd68ddf17 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -21,6 +21,30 @@ using Eigen::internal::TiledEvaluation; // A set of tests to verify that different TensorExecutor strategies yields the // same results for all the ops, supporting tiled evaluation. +// Default assignment that does no use block evaluation or vectorization. +// We assume that default coefficient evaluation is well tested and correct. +template +static void DefaultAssign(Dst& dst, Expr expr) { + using Assign = Eigen::TensorAssignOp; + using Executor = + Eigen::internal::TensorExecutor; + + Executor::run(Assign(dst, expr), DefaultDevice()); +} + +// Assignment with specified device and tiling strategy. +template +static void DeviceAssign(Device& d, Dst& dst, Expr expr) { + using Assign = Eigen::TensorAssignOp; + using Executor = Eigen::internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); +} + template static array RandomDims(int min_dim = 1, int max_dim = 20) { array dims; @@ -222,30 +246,32 @@ static void test_execute_shuffle_rvalue(Device d) Tensor src(dims); src.setRandom(); - // Create a random dimension re-ordering/shuffle. - std::vector shuffle; - for (int i = 0; i < NumDims; ++i) shuffle.push_back(i); - std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); + DSizes shuffle; + for (int i = 0; i < NumDims; ++i) shuffle[i] = i; - const auto expr = src.shuffle(shuffle); + // Test all possible shuffle permutations. + do { + DSizes shuffled_dims; + for (int i = 0; i < NumDims; ++i) { + shuffled_dims[i] = dims[shuffle[i]]; + } - // We assume that shuffling on a default device is tested and correct, so - // we can rely on it to verify correctness of tensor executor and tiling. - Tensor golden; - golden = expr; + const auto expr = src.shuffle(shuffle); - // Now do the shuffling using configured tensor executor. - Tensor dst(golden.dimensions()); + // We assume that shuffling on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. 
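+    // golden is computed with the plain coefficient-wise executor
+    // (DefaultAssign above); dst is computed with the executor configuration
+    // under test (DeviceAssign below) and compared element-by-element.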
+ Tensor golden(shuffled_dims); + DefaultAssign(golden, expr); - using Assign = TensorAssignOp; - using Executor = - internal::TensorExecutor; + // Now do the shuffling using configured tensor executor. + Tensor dst(shuffled_dims); + DeviceAssign(d, dst, expr); - Executor::run(Assign(dst, expr), d); + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } - for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { - VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); - } + } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); } template src(dims); src.setRandom(); - // Create a random dimension re-ordering/shuffle. - std::vector shuffle; - for (int i = 0; i < NumDims; ++i) shuffle.push_back(i); - std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); + DSizes shuffle; + for (int i = 0; i < NumDims; ++i) shuffle[i] = i; - array shuffled_dims; - for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i]; + // Test all possible shuffle permutations. + do { + DSizes shuffled_dims; + for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i]; - // We assume that shuffling on a default device is tested and correct, so - // we can rely on it to verify correctness of tensor executor and tiling. - Tensor golden(shuffled_dims); - golden.shuffle(shuffle) = src; + // We assume that shuffling on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor golden(shuffled_dims); + auto golden_shuffle = golden.shuffle(shuffle); + DefaultAssign(golden_shuffle, src); - // Now do the shuffling using configured tensor executor. - Tensor dst(shuffled_dims); + // Now do the shuffling using configured tensor executor. + Tensor dst(shuffled_dims); + auto dst_shuffle = dst.shuffle(shuffle); + DeviceAssign(d, dst_shuffle, src); - auto expr = dst.shuffle(shuffle); - - using Assign = TensorAssignOp; - using Executor = - internal::TensorExecutor; - - Executor::run(Assign(expr, src), d); + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } - for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { - VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); - } + } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); } template