-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h     27
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h   125
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h   27
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h    13
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h  23
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h     3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h      8
-rw-r--r--  unsupported/test/cxx11_tensor_block_eval.cpp           84
-rw-r--r--  unsupported/test/cxx11_tensor_executor.cpp             23
9 files changed, 281 insertions, 52 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
index 3880e7ed3..b8c592543 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
@@ -418,12 +418,22 @@ class TensorMaterializedBlock {
if (can_use_direct_access) {
const Scalar* block_start = data + desc.offset();
- return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start,
- desc.dimensions());
+ return TensorMaterializedBlock(internal::TensorBlockKind::kView,
+ block_start, desc.dimensions());
} else {
- void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
- Scalar* block_buffer = static_cast<Scalar*>(mem);
+ // Try to reuse destination as an output block buffer.
+ Scalar* block_buffer = desc.template destination<Scalar, Layout>();
+ bool materialized_in_output;
+
+ if (block_buffer != NULL) {
+ materialized_in_output = true;
+
+ } else {
+ materialized_in_output = false;
+ void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+ block_buffer = static_cast<Scalar*>(mem);
+ }
typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
TensorBlockIO;
@@ -438,8 +448,11 @@ class TensorMaterializedBlock {
TensorBlockIO::Copy(dst, src);
- return TensorMaterializedBlock(internal::TensorBlockKind::kMaterializedInScratch,
- block_buffer, desc.dimensions());
+ return TensorMaterializedBlock(
+ materialized_in_output
+ ? internal::TensorBlockKind::kMaterializedInOutput
+ : internal::TensorBlockKind::kMaterializedInScratch,
+ block_buffer, desc.dimensions());
}
}
@@ -1141,7 +1154,7 @@ class TensorBlockAssignment {
it[idx].count = 0;
it[idx].size = target.dims[dim];
it[idx].output_stride = target.strides[dim];
- it[idx].output_span = it[i].output_stride * (it[i].size - 1);
+ it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
idx++;
}
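
The hunks above change `TensorMaterializedBlock::materialize` so that a destination buffer supplied through the block descriptor is reused before falling back to scratch memory, and fix the `it[i]`/`it[idx]` index mix-up in `TensorBlockAssignment`. Below is a minimal standalone sketch of the buffer-selection pattern only; `BlockStorage` and `pick_buffer` are illustrative names, not Eigen API.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Where did the block end up? Mirrors the kMaterializedInOutput /
// kMaterializedInScratch distinction introduced by the patch above.
enum class BlockKind { MaterializedInOutput, MaterializedInScratch };

struct BlockStorage {
  float* buffer;
  BlockKind kind;
};

// Pick a buffer for materializing a block: prefer the caller-provided
// destination (no extra copy later), otherwise fall back to scratch memory.
BlockStorage pick_buffer(float* destination, std::vector<float>& scratch,
                         std::size_t block_size) {
  if (destination != nullptr) {
    return {destination, BlockKind::MaterializedInOutput};
  }
  scratch.resize(block_size);  // stand-in for the scratch allocator
  return {scratch.data(), BlockKind::MaterializedInScratch};
}

int main() {
  std::vector<float> output(16), scratch;
  // With a destination, the block is written straight into the output buffer.
  BlockStorage a = pick_buffer(output.data(), scratch, output.size());
  assert(a.kind == BlockKind::MaterializedInOutput);
  // Without one, it is materialized in freshly allocated scratch memory.
  BlockStorage b = pick_buffer(nullptr, scratch, 16);
  assert(b.kind == BlockKind::MaterializedInScratch);
  return 0;
}
```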
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 8860840a7..20591da33 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -149,7 +149,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
Layout = TensorEvaluator<ArgType, Device>::Layout,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
- BlockAccessV2 = false,
+ BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
// Chipping of outer-most dimension is a trivial operation, because we can
// read and write directly from the underlying tensor using single offset.
IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
@@ -171,7 +171,17 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
OutputTensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef internal::TensorBlockDescriptor<NumInputDims, Index>
+ ArgTensorBlockDesc;
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+ ArgTensorBlock;
+
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+ Layout, Index>
+ TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -357,6 +367,72 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
m_impl.block(&input_block);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ const Index chip_dim = m_dim.actualDim();
+
+ DSizes<Index, NumInputDims> input_block_dims;
+ for (int i = 0; i < NumInputDims; ++i) {
+ input_block_dims[i] = i < chip_dim ? desc.dimension(i)
+ : i > chip_dim ? desc.dimension(i - 1)
+ : 1;
+ }
+
+ ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);
+
+ // Try to reuse destination buffer for materializing argument block.
+ ScalarNoConst* destination_buffer =
+ desc.template destination<ScalarNoConst, Layout>();
+ if (destination_buffer != NULL) {
+ arg_desc.AddDestinationBuffer(
+ destination_buffer, internal::strides<Layout>(arg_desc.dimensions()),
+ (arg_desc.size() * sizeof(Scalar)));
+ }
+
+ ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch);
+
+ if (arg_block.data() != NULL) {
+ // Forward argument block buffer if possible.
+ return TensorBlockV2(arg_block.kind(), arg_block.data(),
+ desc.dimensions());
+
+ } else {
+ // Assign argument block expression to a buffer.
+
+ // Try to reuse destination as an output buffer.
+ ScalarNoConst* output_buffer =
+ desc.template destination<ScalarNoConst, Layout>();
+ bool materialized_in_output;
+
+ if (output_buffer != NULL) {
+ materialized_in_output = true;
+
+ } else {
+ materialized_in_output = false;
+ const size_t materialized_output_size = desc.size() * sizeof(Scalar);
+ void* output_scratch_mem = scratch.allocate(materialized_output_size);
+ output_buffer = static_cast<ScalarNoConst*>(output_scratch_mem);
+ }
+
+ typedef internal::TensorBlockAssignment<
+ ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
+ TensorBlockAssignment;
+
+ TensorBlockAssignment::Run(
+ TensorBlockAssignment::target(
+ arg_desc.dimensions(),
+ internal::strides<Layout>(arg_desc.dimensions()),
+ output_buffer),
+ arg_block.expr());
+
+ return TensorBlockV2(
+ materialized_in_output
+ ? internal::TensorBlockKind::kMaterializedInOutput
+ : internal::TensorBlockKind::kMaterializedInScratch,
+ output_buffer, desc.dimensions());
+ }
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
typename Storage::Type result = constCast(m_impl.data());
if (isOuterChipping() && result) {
@@ -434,11 +510,12 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- RawAccess = false
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
};
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
@@ -448,6 +525,10 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
OutputTensorBlock;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
@@ -539,6 +620,36 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
input_block_strides, this->m_inputStrides,
const_cast<ScalarNoConst*>(output_block.data())));
}
+
+ template <typename TensorBlockV2>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
+ const TensorBlockDesc& desc, const TensorBlockV2& block) {
+ assert(this->m_impl.data() != NULL);
+
+ const Index chip_dim = this->m_dim.actualDim();
+
+ DSizes<Index, NumInputDims> input_block_dims;
+ for (int i = 0; i < NumInputDims; ++i) {
+ input_block_dims[i] = i < chip_dim ? desc.dimension(i)
+ : i > chip_dim ? desc.dimension(i - 1)
+ : 1;
+ }
+
+ typedef TensorReshapingOp<const DSizes<Index, NumInputDims>,
+ const typename TensorBlockV2::XprType>
+ TensorBlockExpr;
+
+ typedef internal::TensorBlockAssignment<Scalar, NumInputDims,
+ TensorBlockExpr, Index>
+ TensorBlockAssign;
+
+ TensorBlockAssign::Run(
+ TensorBlockAssign::target(
+ input_block_dims,
+ internal::strides<Layout>(this->m_impl.dimensions()),
+ this->m_impl.data(), this->srcCoeff(desc.offset())),
+ block.expr().reshape(input_block_dims));
+ }
};
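
Both the rvalue `blockV2` and the lvalue `writeBlockV2` above rebuild the argument block dimensions by re-inserting a size-1 extent at the chipped dimension. A standalone sketch of that mapping, with illustrative names and types, is below.

```cpp
#include <array>
#include <cassert>
#include <cstddef>

// Rebuild N input dimensions from the N-1 dimensions of a chipped block by
// inserting a size-1 extent at the chipped dimension, as blockV2() does.
template <std::size_t NumInputDims>
std::array<long, NumInputDims> chip_to_input_dims(
    const std::array<long, NumInputDims - 1>& chipped, int chip_dim) {
  std::array<long, NumInputDims> input{};
  for (std::size_t i = 0; i < NumInputDims; ++i) {
    if (static_cast<int>(i) < chip_dim)      input[i] = chipped[i];
    else if (static_cast<int>(i) > chip_dim) input[i] = chipped[i - 1];
    else                                     input[i] = 1;  // chipped extent
  }
  return input;
}

int main() {
  // A block of a rank-2 chip of a rank-3 tensor, chipped along dimension 1.
  std::array<long, 2> chipped_block = {4, 7};
  std::array<long, 3> input_block = chip_to_input_dims<3>(chipped_block, 1);
  assert(input_block[0] == 4 && input_block[1] == 1 && input_block[2] == 7);
  return 0;
}
```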
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index b1d668744..b77d8fe84 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -53,18 +53,22 @@ struct TensorEvaluator
RawAccess = true
};
- typedef typename internal::TensorBlock<
- typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout>
TensorBlock;
- typedef typename internal::TensorBlockReader<
- typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout>
TensorBlockReader;
- typedef typename internal::TensorBlockWriter<
- typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ typedef typename internal::TensorBlockWriter<ScalarNoConst, Index, NumCoords, Layout>
TensorBlockWriter;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
+ Layout, Index>
+ TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
@@ -161,6 +165,12 @@ struct TensorEvaluator
TensorBlockReader::Run(block, m_data);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ assert(m_data != NULL);
+ return TensorBlockV2::materialize(m_data, m_dims, desc, scratch);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
const TensorBlock& block) {
assert(m_data != NULL);
@@ -269,11 +279,6 @@ struct TensorEvaluator<const Derived, Device>
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
- typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumCoords, Layout>
- TensorBlockIO;
- typedef typename TensorBlockIO::Dst TensorBlockIODst;
- typedef typename TensorBlockIO::Src TensorBlockIOSrc;
-
typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
Layout, Index>
TensorBlockV2;
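
The new `blockV2` above simply calls `TensorBlockV2::materialize` on the raw tensor buffer; conceptually that either returns a view when the requested block is contiguous or copies a strided sub-region into the chosen buffer. A simplified column-major 2-D copy illustrating the non-view path (all names are illustrative, not Eigen API):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Copy a [rows x cols] sub-block starting at (row0, col0) out of a
// column-major matrix with leading dimension `ld` -- roughly what
// materializing a non-contiguous block into scratch/destination memory does.
std::vector<float> copy_block(const std::vector<float>& src, int ld,
                              int row0, int col0, int rows, int cols) {
  std::vector<float> dst(static_cast<std::size_t>(rows) * cols);
  for (int c = 0; c < cols; ++c)
    for (int r = 0; r < rows; ++r)
      dst[c * rows + r] = src[(col0 + c) * ld + (row0 + r)];
  return dst;
}

int main() {
  // 4x3 column-major matrix filled with 0..11.
  std::vector<float> m(12);
  for (int i = 0; i < 12; ++i) m[i] = static_cast<float>(i);
  // 2x2 block starting at row 1, column 1: elements {5, 6, 9, 10}.
  std::vector<float> blk = copy_block(m, /*ld=*/4, 1, 1, 2, 2);
  assert(blk[0] == 5 && blk[1] == 6 && blk[2] == 9 && blk[3] == 10);
  return 0;
}
```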
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 97ac96db1..6ad6327a6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -521,6 +521,19 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const ThreadPoolDevice& device) {
Evaluator evaluator(expr, device);
+ Index total_size = array_prod(evaluator.dimensions());
+ Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+
+ // TODO(ezuhulenev): For small expressions, the cost of block mapping and
+ // resource requirements gathering dominates the cost of expression
+ // evaluation.
+ if (total_size < cache_size &&
+ !ExpressionHasTensorBroadcastingOp<Expression>::value) {
+ internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+ /*Tiling=*/TiledEvaluation::Off>::run(expr, device);
+ evaluator.cleanup();
+ return;
+ }
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
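
The executor hunk above short-circuits tiled evaluation when the whole expression fits in the first-level cache and contains no broadcasting, since setting up block mapping would cost more than it saves. A minimal sketch of that decision follows; the function name and thresholds are illustrative only.

```cpp
#include <cstddef>
#include <iostream>

// Decide whether tiled (block) evaluation is worth it: skip it for
// expressions that fit in the first-level cache and contain no broadcasting,
// mirroring the fallback added to the thread-pool TensorExecutor above.
bool use_tiled_evaluation(std::size_t total_elems, std::size_t elem_size,
                          std::size_t l1_cache_bytes, bool has_broadcasting) {
  const std::size_t cache_elems = l1_cache_bytes / elem_size;
  return !(total_elems < cache_elems && !has_broadcasting);
}

int main() {
  // A 32x32 float expression (4 KiB) fits in a 32 KiB L1 cache: evaluate it
  // directly instead of paying the block-mapping overhead.
  std::cout << use_tiled_evaluation(32 * 32, sizeof(float), 32 * 1024, false)
            << "\n";  // prints 0
  // A large expression still goes through the tiled path.
  std::cout << use_tiled_evaluation(1 << 22, sizeof(float), 32 * 1024, false)
            << "\n";  // prints 1
  return 0;
}
```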
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 8d45bd62a..d98af1355 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -97,21 +97,26 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
IsAligned = true,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
- BlockAccessV2 = false,
+ BlockAccessV2 = internal::is_arithmetic<CoeffReturnType>::value,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true
};
- typedef typename internal::TensorBlock<
- CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+ static const int NumDims = internal::traits<ArgType>::NumDimensions;
+
+ typedef typename internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout>
TensorBlock;
- typedef typename internal::TensorBlockReader<
- CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+ typedef typename internal::TensorBlockReader<CoeffReturnType, Index, NumDims, Layout>
TensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+ Layout, Index>
+ TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
@@ -170,6 +175,12 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
TensorBlockReader::Run(block, m_buffer);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ assert(m_buffer != NULL);
+ return TensorBlockV2::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 5d4b0f061..c9d78ba9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -644,6 +644,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
}
+ // No strides for scalars.
+ if (NumDims == 0) return;
+
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const Sizes& output_dims = op.sizes();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index f3907be6e..a0b4e04b1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -334,8 +334,12 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
// Want to copy from input.
(output_inner_dim_size - output_inner_pad_before_size),
// Can copy from input.
- (static_cast<Index>(m_impl.dimensions()[inner_dim_idx]) -
- numext::maxi(input_offsets[inner_dim_idx], Index(0))));
+ numext::maxi(
+ static_cast<Index>(m_impl.dimensions()[inner_dim_idx]) -
+ (input_offsets[inner_dim_idx] + output_inner_pad_before_size),
+ Index(0)));
+
+ eigen_assert(output_inner_copy_size >= 0);
// How many values to fill with padding AFTER reading from the input inner
// dimension.
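
The padding fix above clamps the copyable inner-dimension size at zero when a block lies entirely inside the padded region, instead of letting it go negative. A small arithmetic sketch of the corrected bound; variable names follow the diff, the values are made up.

```cpp
#include <algorithm>
#include <cassert>

int main() {
  // Made-up values for a block that starts past the end of the input, i.e.
  // entirely inside the trailing padding.
  const long input_dim_size = 10;               // m_impl.dimensions()[inner_dim_idx]
  const long input_offset = 15;                 // input_offsets[inner_dim_idx]
  const long output_inner_pad_before_size = 0;  // nothing to pad before
  const long want_to_copy = 3;                  // output size minus leading padding

  // Old bound: input_dim_size - max(input_offset, 0) == -5, which made the
  // copy size negative. The new bound clamps at zero.
  const long can_copy = std::max(
      input_dim_size - (input_offset + output_inner_pad_before_size), 0L);

  const long output_inner_copy_size = std::min(want_to_copy, can_copy);
  assert(output_inner_copy_size == 0);  // nothing to read from the input
  return 0;
}
```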
diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp
index 1dc0a9e2c..e11092af3 100644
--- a/unsupported/test/cxx11_tensor_block_eval.cpp
+++ b/unsupported/test/cxx11_tensor_block_eval.cpp
@@ -82,14 +82,14 @@ static TensorBlockParams<NumDims> SkewedInnerBlock(
index -= idx * strides[i];
offsets[i] = idx;
}
- offsets[0] = index;
+ if (NumDims > 0) offsets[0] = index;
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / strides[i];
index -= idx * strides[i];
offsets[i] = idx;
}
- offsets[NumDims - 1] = index;
+ if (NumDims > 0) offsets[NumDims - 1] = index;
}
auto desc = TensorBlockDescriptor<NumDims>(block.first_coeff_index(), sizes);
@@ -333,6 +333,42 @@ static void test_eval_tensor_padding() {
[&padded_dims]() { return SkewedInnerBlock<Layout>(padded_dims); });
}
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_chipping() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ Index chip_dim = internal::random<int>(0, NumDims - 1);
+ Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);
+
+ DSizes<Index, NumDims - 1> chipped_dims;
+ for (Index i = 0; i < chip_dim; ++i) {
+ chipped_dims[i] = dims[i];
+ }
+ for (Index i = chip_dim + 1; i < NumDims; ++i) {
+ chipped_dims[i - 1] = dims[i];
+ }
+
+ // Block buffer forwarding.
+ VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+ input.chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+ VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+ input.chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+
+ // Block expression assignment.
+ VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+ input.square().chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+ VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+ input.square().chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+}
+
template <typename T, int Layout>
static void test_eval_tensor_reshape_with_bcast() {
Index dim = internal::random<Index>(1, 100);
@@ -384,8 +420,8 @@ static void test_eval_tensor_forced_eval() {
// as an assignment to TensorSliceOp (writing a block is identical to
// assigning one tensor to a slice of another tensor).
-template <typename T, int NumDims, int Layout, typename Expression,
- typename GenBlockParams>
+template <typename T, int NumDims, int Layout, int NumExprDims = NumDims,
+ typename Expression, typename GenBlockParams>
static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
Expression expr, GenBlockParams gen_block) {
using Device = DefaultDevice;
@@ -395,17 +431,17 @@ static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
auto eval = TensorEvaluator<decltype(expr), Device>(expr, d);
// Generate a random block, or choose a block that fits in full expression.
- TensorBlockParams<NumDims> block_params = gen_block();
+ TensorBlockParams<NumExprDims> block_params = gen_block();
// Generate random data of the selected block size.
- Tensor<T, NumDims, Layout> block(block_params.desc.dimensions());
+ Tensor<T, NumExprDims, Layout> block(block_params.desc.dimensions());
block.setRandom();
// ************************************************************************ //
// (1) Assignment from a block.
// Construct a materialize block from a random generated block tensor.
- internal::TensorMaterializedBlock<T, NumDims, Layout> blk(
+ internal::TensorMaterializedBlock<T, NumExprDims, Layout> blk(
internal::TensorBlockKind::kView, block.data(), block.dimensions());
// Reset all underlying tensor values to zero.
@@ -478,6 +514,37 @@ static void test_assign_to_tensor_reshape() {
[&shuffled]() { return FixedSizeBlock(shuffled); });
}
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_chipping() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> tensor(dims);
+
+ Index chip_dim = internal::random<int>(0, NumDims - 1);
+ Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);
+
+ DSizes<Index, NumDims - 1> chipped_dims;
+ for (Index i = 0; i < chip_dim; ++i) {
+ chipped_dims[i] = dims[i];
+ }
+ for (Index i = chip_dim + 1; i < NumDims; ++i) {
+ chipped_dims[i - 1] = dims[i];
+ }
+
+ TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+ VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+ tensor, map.chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+
+ VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+ tensor, map.chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return SkewedInnerBlock<Layout>(chipped_dims); });
+
+ VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+ tensor, map.chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+}
+
// -------------------------------------------------------------------------- //
#define CALL_SUBTESTS_DIMS_LAYOUTS(NAME) \
@@ -503,12 +570,15 @@ EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_broadcast);
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_reshape);
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_cast);
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_select);
CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_padding);
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_chipping);
CALL_SUBTESTS_LAYOUTS(test_eval_tensor_reshape_with_bcast);
CALL_SUBTESTS_LAYOUTS(test_eval_tensor_forced_eval);
CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor);
CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_reshape);
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_chipping);
// clang-format on
}
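
The new block-evaluation tests above drive the machinery through the public chipping API. For reference, a plain user-level chipping example (independent of the block internals; assumes Eigen's unsupported headers are on the include path):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 3> input(4, 5, 6);
  input.setRandom();

  // Chip dimension 1 at offset 2: the result is a rank-2 tensor of shape (4, 6).
  Eigen::Tensor<float, 2> chipped = input.chip(2, 1);
  std::cout << "chipped dims: " << chipped.dimension(0) << " x "
            << chipped.dimension(1) << "\n";

  // Chipping an lvalue assigns into a slice of the destination tensor,
  // which is the path exercised by test_assign_to_tensor_chipping.
  Eigen::Tensor<float, 3> output(4, 5, 6);
  output.setZero();
  output.chip(2, 1) = chipped * 2.0f;
  return 0;
}
```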
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index efae81961..8fb4ba752 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -180,9 +180,8 @@ static void test_execute_chipping_lvalue(Device d)
\
const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
\
- /* Generate random data to fill non-chipped dimensions*/ \
Tensor<T, NumDims, Layout, Index> random(dims); \
- random.setRandom(); \
+ random.setZero(); \
\
Tensor<T, NumDims, Layout, Index> golden(dims); \
golden = random; \
@@ -716,13 +715,13 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 4);
CALL_SUBTEST_COMBINATIONS_V2(3, test_execute_broadcasting, float, 5);
- CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 3);
- CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 4);
- CALL_SUBTEST_COMBINATIONS_V1(4, test_execute_chipping_rvalue, float, 5);
+ CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS_V2(4, test_execute_chipping_rvalue, float, 5);
- CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 3);
- CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 4);
- CALL_SUBTEST_COMBINATIONS_V1(5, test_execute_chipping_lvalue, float, 5);
+ CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS_V2(5, test_execute_chipping_lvalue, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 3);
CALL_SUBTEST_COMBINATIONS_V1(6, test_execute_shuffle_rvalue, float, 4);
@@ -752,10 +751,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(11, test_execute_slice_lvalue, float, 5);
- CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 2);
- CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 3);
- CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 4);
- CALL_SUBTEST_COMBINATIONS_V1(12, test_execute_broadcasting_of_forced_eval, float, 5);
+ CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 2);
+ CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 3);
+ CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 4);
+ CALL_SUBTEST_COMBINATIONS_V2(12, test_execute_broadcasting_of_forced_eval, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(13, test_execute_generator_op, float, 3);