Add block evaluation to TensorReshaping/TensorCasting/TensorPadding/TensorSelect

author: Eugene Zhulenev <ezhulenev@google.com> 2019-10-02 12:44:06 -0700
committer: Eugene Zhulenev <ezhulenev@google.com> 2019-10-02 12:44:06 -0700
commit: 60ae24ee1a6c16114de456d77fcfba6f5a1160ca (patch)
tree: 7b9d5463018055571a5050ca31a8d3df12a3e6fc /unsupported/Eigen/CXX11/src
parent: 6e40454a6e6cc57c07c7340148657c985ca6c928 (diff)
7 files changed, 692 insertions, 144 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
index 25047b8e5..4d2145bf3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
@@ -12,13 +12,18 @@ namespace Eigen {
 namespace internal {
 
 // -------------------------------------------------------------------------- //
+// Forward declarations for templates defined below.
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIOV2;
+
+// -------------------------------------------------------------------------- //
 // Helper function to compute strides for densely stored buffer of given
 // dimensions.
 
 // TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
 // this function instead everywhere.
 template <int Layout, typename IndexType, int NumDims>
-EIGEN_STRONG_INLINE DSizes<IndexType, NumDims> strides(
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
     const DSizes<IndexType, NumDims>& dimensions) {
   DSizes<IndexType, NumDims> strides;
   if (NumDims == 0) return strides;
@@ -40,6 +45,14 @@ EIGEN_STRONG_INLINE DSizes<IndexType, NumDims> strides(
   return strides;
 }
 
+#if EIGEN_HAS_CXX11
+template <int Layout, std::ptrdiff_t... Indices>  // Overload of strides() for compile-time Sizes<...>.
+EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
+    const Sizes<Indices...>& sizes) {
+  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));  // Convert to DSizes, then call strides<Layout> on it.
+}
+#endif
+
 // -------------------------------------------------------------------------- //
 // TensorBlockDescriptor specifies a block offset within a tensor and the block
 // sizes along each of the tensor dimensions.
@@ -155,6 +168,14 @@ class TensorBlockDescriptor {
         DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes);
   }
 
+  template <typename Scalar, typename DstStridesIndexType>  // Accepts destination strides expressed with a different index type.
+  void AddDestinationBuffer(
+      Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides,
+      size_t total_dst_bytes) {
+    // DSizes constructor will do index type promotion if it's safe.
+    AddDestinationBuffer(dst_base, Dimensions(dst_strides), total_dst_bytes);  // Forward with the strides converted to `Dimensions`.
+  }
+
   TensorBlockDescriptor& DropDestinationBuffer() {
     m_destination.m_data = NULL;
     return *this;
@@ -333,10 +354,11 @@ class TensorMaterializedBlock {
   typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
 #endif
  public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
   typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
 
   TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
-                          const DSizes<IndexType, NumDims>& dimensions)
+                          const Dimensions& dimensions)
       : m_kind(kind),
         m_data(data),
         m_dimensions(dimensions),
@@ -352,18 +374,84 @@ class TensorMaterializedBlock {
   // properly for TensorMap.
   const XprType& expr() const { return m_expr; }
   const Scalar* data() const { return m_data; }
-
   void cleanup() {}
 
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+
+  // Creates a materialized block for the given descriptor from a memory buffer: either a view into `data` or a copy into `scratch` memory.
+  template <typename DataDimensions, typename TensorBlockScratch>
+  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
+      const Scalar* data, const DataDimensions& data_dims,
+      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
+
+    // If the tensor block covers a contiguous region of the underlying memory,
+    // we can skip block buffer allocation and construct the block directly
+    // from the existing `data` memory buffer.
+    //
+    // Example: (RowMajor layout)
+    //   data_dims:          [11, 12, 13, 14]
+    //   desc.dimensions():  [1,   1,  3, 14]
+    //
+    // In this case we can construct a TensorBlock starting at
+    // `data + desc.offset()`, with `desc.dimensions()` block sizes.
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Find out how many inner dimensions have a matching size.
+    int num_matching_inner_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;  // Walk from the innermost dimension outwards.
+      if (data_dims[dim] != desc.dimensions()[dim]) break;
+      ++num_matching_inner_dims;
+    }
+
+    // All outer dimensions must be of size `1`, except the one dimension right
+    // outside the matching inner dimensions (`3` in the example above).
+    bool can_use_direct_access = true;
+    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {  // `+ 1` skips the one dimension allowed to be partial.
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (desc.dimension(dim) != 1) {
+        can_use_direct_access = false;
+        break;
+      }
+    }
+
+    if (can_use_direct_access) {
+      const Scalar* block_start = data + desc.offset();
+      return TensorMaterializedBlock(TensorBlockKind::kView, block_start,  // No copy: the block points into `data`.
+                                     desc.dimensions());
+
+    } else {
+      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));  // Room for every coefficient of the block.
+      Scalar* block_buffer = static_cast<Scalar*>(mem);
+
+      typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
+          TensorBlockIO;
+      typedef typename TensorBlockIO::Dst TensorBlockIODst;
+      typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),  // Source: strides of the full `data_dims`, starting at the block offset.
+                           data, desc.offset());
+      TensorBlockIODst dst(desc.dimensions(),  // Destination: strides computed from the block dimensions only.
+                           internal::strides<Layout>(desc.dimensions()),
+                           block_buffer);
+
+      TensorBlockIO::Copy(dst, src);
+
+      return TensorMaterializedBlock(TensorBlockKind::kMaterializedInScratch,
+                                     block_buffer, desc.dimensions());
+    }
+  }
+
  private:
   TensorBlockKind m_kind;
   const Scalar* m_data;
-  DSizes<IndexType, NumDims> m_dimensions;
+  Dimensions m_dimensions;
   XprType m_expr;
 };
 
 // -------------------------------------------------------------------------- //
-// TensorCwiseUnaryBlock is a lazy tensor expression that applies UnaryOp
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
 // functor to the blocks produced by the underlying Tensor expression.
 
 template <typename UnaryOp, typename ArgTensorBlock>
@@ -398,7 +486,7 @@ class TensorCwiseUnaryBlock {
 };
 
 // -------------------------------------------------------------------------- //
-// TensorCwiseUnaryBlock is a lazy tensor expression that applies BinaryOp
+// TensorCwiseBinaryBlock is a lazy tensor expression block that applies BinaryOp
 // functor to the blocks produced by the underlying Tensor expression.
 
 template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
@@ -447,6 +535,96 @@ class TensorCwiseBinaryBlock {
 };
 
 // -------------------------------------------------------------------------- //
+// TensorUnaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from a block of the underlying type (this is a
+// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
+
+template <typename BlockFactory, typename ArgTensorBlock>
+class TensorUnaryExprBlock {
+#if !EIGEN_HAS_CXX11
+  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
+#endif
+
+  typedef typename ArgTensorBlock::XprType ArgXprType;
+  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;  // True when the argument block's XprType is void.
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;  // void if the argument has no expression, else the factory-built type.
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
+                       const BlockFactory& factory)
+      : m_arg_block(arg_block), m_factory(factory) {}  // Both arguments are copied into the block.
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }  // Always reports kExpr.
+  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }  // Built by the factory from the argument expression on each call.
+  const Scalar* data() const { return NULL; }  // Always NULL.
+  void cleanup() { m_arg_block.cleanup(); }  // Forwards to the wrapped argument block.
+
+ private:
+  ArgTensorBlock m_arg_block;
+  BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorTernaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from three blocks of the underlying type.
+
+template <typename BlockFactory, typename Arg1TensorBlock,
+          typename Arg2TensorBlock, typename Arg3TensorBlock>
+class TensorTernaryExprBlock {
+#if !EIGEN_HAS_CXX11
+  typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
+#endif
+
+  typedef typename Arg1TensorBlock::XprType Arg1XprType;
+  typedef typename Arg2TensorBlock::XprType Arg2XprType;
+  typedef typename Arg3TensorBlock::XprType Arg3XprType;
+
+  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
+                                       internal::is_void<Arg2XprType>::value ||
+                                       internal::is_void<Arg3XprType>::value;  // True if any argument block has a void XprType.
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
+                                              Arg3XprType>::type>::type XprType;  // void if any argument has no expression, else the factory-built type.
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
+                         const Arg2TensorBlock& arg2_block,
+                         const Arg3TensorBlock& arg3_block,
+                         const BlockFactory& factory)
+      : m_arg1_block(arg1_block),
+        m_arg2_block(arg2_block),
+        m_arg3_block(arg3_block),
+        m_factory(factory) {}  // All arguments are copied into the block.
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }  // Always reports kExpr.
+  XprType expr() const {
+    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),  // Built by the factory from the three argument expressions on each call.
+                          m_arg3_block.expr());
+  }
+  const Scalar* data() const { return NULL; }  // Always NULL.
+  void cleanup() {  // Forwards to each wrapped argument block in turn.
+    m_arg1_block.cleanup();
+    m_arg2_block.cleanup();
+    m_arg3_block.cleanup();
+  }
+
+ private:
+  Arg1TensorBlock m_arg1_block;
+  Arg2TensorBlock m_arg2_block;
+  Arg3TensorBlock m_arg3_block;
+  BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
 // StridedLinearBufferCopy provides a method to copy data between two linear
 // buffers with different strides, with optimized paths for scatter/gather.
 
@@ -547,7 +725,13 @@ class StridedLinearBufferCopy {
     } else if (kind == FillLinear) {
       // Fill `dst` with value at `*src`.
       eigen_assert(src_stride == 0 && dst_stride == 1);
+      const IndexType unrolled_size = count - 4 * PacketSize;
       Packet p = pload1<Packet>(src);
+      for (; i <= unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+        }
+      }
       for (; i <= vectorized_size; i += PacketSize) {
         pstoreu<Scalar, Packet>(dst + i, p);
       }
@@ -809,15 +993,15 @@ class TensorBlockIOV2 {
 
 // -------------------------------------------------------------------------- //
 // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
-// a Tensor block defined by `desc`, backed by a memory buffer at `dst` address.
+// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
 //
 // Currently there is no way to write from a Tensor expression to a block of
 // memory, if dimensions are reordered. If you need to do that, you should
 // materialize a Tensor block expression into a memory buffer, and then use
 // TensorBlockIO to copy data between two memory buffers with a custom
-// `dst->src` dimension map (see definition above).
+// `target->src` dimension map (see definition above).
 //
-// Also currently the innermost dimension of `dst` must have a stride '1'
+// Also currently the innermost dimension of `target` must have a stride '1'
 // (contiguous in memory). This restriction could be lifted with a `pscatter`,
 // but in practice it's never needed, and there is a similar TensorBlockIO
 // workaround for that.
@@ -842,18 +1026,18 @@ class TensorBlockAssignment {
 
   template <bool Vectorizable, typename Evaluator>
   struct InnerDimAssign {
-    EIGEN_ALWAYS_INLINE static void Run(Scalar* dst, IndexType count,
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                         const Evaluator& eval,
                                         IndexType eval_offset) {
       for (IndexType i = 0; i < count; ++i) {
-        dst[i] = eval.coeff(eval_offset + i);
+        target[i] = eval.coeff(eval_offset + i);
       }
     }
   };
 
   template <typename Evaluator>
   struct InnerDimAssign<true, Evaluator> {
-    EIGEN_ALWAYS_INLINE static void Run(Scalar* dst, IndexType count,
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                         const Evaluator& eval,
                                         IndexType eval_offset) {
       typedef typename packet_traits<Scalar>::type Packet;
@@ -866,26 +1050,29 @@ class TensorBlockAssignment {
         for (int j = 0; j < 4; ++j) {
           const IndexType idx = eval_offset + i + j * PacketSize;
           Packet p = eval.template packet<Unaligned>(idx);
-          pstoreu<Scalar>(dst + i + j * PacketSize, p);
+          pstoreu<Scalar>(target + i + j * PacketSize, p);
         }
       }
 
       for (; i <= vectorized_size; i += PacketSize) {
         Packet p = eval.template packet<Unaligned>(eval_offset + i);
-        pstoreu<Scalar>(dst + i, p);
+        pstoreu<Scalar>(target + i, p);
       }
 
       for (; i < count; ++i) {
-        dst[i] = eval.coeff(eval_offset + i);
+        target[i] = eval.coeff(eval_offset + i);
       }
     }
   };
 
  public:
-  struct Dst {
-    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
-        IndexType dst_offset = 0)
-        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
+  struct Target {
+    Target(const Dimensions& target_dims, const Dimensions& target_strides,
+           Scalar* target_data, IndexType target_offset = 0)
+        : dims(target_dims),
+          strides(target_strides),
+          data(target_data),
+          offset(target_offset) {}
 
     Dimensions dims;
     Dimensions strides;
@@ -893,34 +1080,50 @@ class TensorBlockAssignment {
     IndexType offset;
   };
 
+  static Target target(const Dimensions& target_dims,  // Factory for `Target`, so callers can build one inline in a call to Run().
+                       const Dimensions& target_strides, Scalar* target_data,
+                       IndexType target_offset = 0) {
+    return Target(target_dims, target_strides, target_data, target_offset);  // Arguments are passed through unchanged.
+  }
+
+  template <typename TargetDimsIndexType, typename TargetStridesIndexType>  // Overload for dims/strides expressed with other index types.
+  static Target target(
+      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
+      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
+      Scalar* target_data, IndexType target_offset = 0) {
+    // DSizes constructor will do index type promotion if it's safe.
+    return Target(Dimensions(target_dims), Dimensions(target_strides),
+                  target_data, target_offset);
+  }
+
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Dst& dst, const TensorBlockExpr& expr) {
+      const Target& target, const TensorBlockExpr& expr) {
     // Prepare evaluator for block expression.
     DefaultDevice default_device;
     TensorBlockEvaluator eval(expr, default_device);
 
     // Tensor block expression dimension should match destination dimensions.
-    eigen_assert(dimensions_match(dst.dims, eval.dimensions()));
+    eigen_assert(dimensions_match(target.dims, eval.dimensions()));
 
     static const int Layout = TensorBlockEvaluator::Layout;
     static const bool is_col_major = Layout == ColMajor;
 
     // Initialize output inner dimension size based on a layout.
-    const IndexType output_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
+    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
     const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
-    IndexType output_inner_dim_size = dst.dims[inner_dim_idx];
+    IndexType output_inner_dim_size = target.dims[inner_dim_idx];
 
-    // Dst inner dimension stride must be '1'.
-    eigen_assert(dst.strides[inner_dim_idx] == 1);
+    // Target inner dimension stride must be '1'.
+    eigen_assert(target.strides[inner_dim_idx] == 1);
 
-    // Squeeze multiple inner dims into one if they are contiguous in `dst`.
+    // Squeeze multiple inner dims into one if they are contiguous in `target`.
     IndexType num_squeezed_dims = 0;
     for (Index i = 1; i < NumDims; ++i) {
       const Index dim = is_col_major ? i : NumDims - i - 1;
-      const IndexType dst_stride = dst.strides[dim];
+      const IndexType target_stride = target.strides[dim];
 
-      if (output_inner_dim_size == dst_stride) {
-        output_inner_dim_size *= dst.dims[dim];
+      if (output_inner_dim_size == target_stride) {
+        output_inner_dim_size *= target.dims[dim];
         num_squeezed_dims++;
       } else {
         break;
@@ -936,22 +1139,22 @@ class TensorBlockAssignment {
       const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
 
       it[idx].count = 0;
-      it[idx].size = dst.dims[dim];
-      it[idx].output_stride = dst.strides[dim];
+      it[idx].size = target.dims[dim];
+      it[idx].output_stride = target.strides[dim];
       it[idx].output_span = it[i].output_stride * (it[i].size - 1);
       idx++;
     }
 
     // We read block expression from the beginning, and start writing data to
-    // `dst` at given offset.
+    // `target` at given offset.
     IndexType input_offset = 0;
-    IndexType output_offset = dst.offset;
+    IndexType output_offset = target.offset;
 
-    // Iterate copying data from `eval` to `dst`.
+    // Iterate copying data from `eval` to `target`.
     for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
-      // Assign to `dst` at current offset.
+      // Assign to `target` at current offset.
       InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
-                     TensorBlockEvaluator>::Run(dst.data + output_offset,
+                     TensorBlockEvaluator>::Run(target.data + output_offset,
                                                 output_inner_dim_size, eval,
                                                 input_offset);
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 9e4fae99a..dc9551d32 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -1247,10 +1247,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
           ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
           TensorBlockAssignment;
 
-      typename TensorBlockAssignment::Dst assignment_dst(
-          input_block_sizes, input_block_strides, *materialized_input);
-
-      TensorBlockAssignment::Run(assignment_dst, input_block.expr());
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(input_block_sizes, input_block_strides,
+                                        *materialized_input),
+          input_block.expr());
 
       input_buffer = *materialized_input;
     }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index a8160e17e..cc3e67677 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -294,23 +294,45 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
   typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess =
+    IsAligned         = false,
+    PacketAccess      =
     #ifndef EIGEN_USE_SYCL
-    true,
+                        true,
     #else
-    TensorEvaluator<ArgType, Device>::PacketAccess &
-        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+                        TensorEvaluator<ArgType, Device>::PacketAccess &
+                        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
     #endif
-    BlockAccess = false,
-    BlockAccessV2 = false,
-    PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = false
+    BlockAccess       = false,
+    BlockAccessV2     = TensorEvaluator<ArgType, Device>::BlockAccessV2,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = false
   };
 
+  static const int NumDims = internal::array_size<Dimensions>::value;
+
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+      ArgTensorBlock;
+
+  struct TensorConversionOpBlockFactory {  // Block factory plugged into TensorUnaryExprBlock by the TensorBlockV2 typedef below.
+    template <typename ArgXprType>
+    struct XprType {
+      typedef TensorConversionOp<TargetType, const ArgXprType> type;  // Conversion to TargetType applied to the argument block expression.
+    };
+
+    template <typename ArgXprType>
+    typename XprType<ArgXprType>::type expr(const ArgXprType& expr) const {
+      return typename XprType<ArgXprType>::type(expr);  // Wrap the argument expression in the conversion op.
+    }
+  };
+
+  typedef internal::TensorUnaryExprBlock<TensorConversionOpBlockFactory,
+                                         ArgTensorBlock>
+      TensorBlockV2;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -376,6 +398,17 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
     }
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    m_impl.getResourceRequirements(resources);  // Nothing added here; forwards to the argument evaluator.
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+    return TensorBlockV2(m_impl.blockV2(desc, scratch),  // Wrap the argument's block; the conversion expression is built in expr().
+                         TensorConversionOpBlockFactory());
+  }
+
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
   /// required by sycl in order to extract the sycl accessor
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index c87075a72..b1d668744 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -176,11 +176,12 @@ struct TensorEvaluator
     typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
                                             Index>
         TensorBlockAssign;
-    typename TensorBlockAssign::Dst dst(desc.dimensions(),
-                                        internal::strides<Layout>(m_dims),
-                                        m_data, desc.offset());
 
-    TensorBlockAssign::Run(dst, block.expr());
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(desc.dimensions(),
+                                  internal::strides<Layout>(m_dims), m_data,
+                                  desc.offset()),
+        block.expr());
   }
 
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
@@ -349,62 +350,7 @@ struct TensorEvaluator<const Derived, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
   blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
     assert(m_data != NULL);
-
-    // TODO(ezhulenev): Move it to TensorBlockV2 and reuse in TensorForcedEval.
-
-    // If a tensor block descriptor covers a contiguous block of the underlying
-    // memory, we can skip block buffer memory allocation, and construct a block
-    // from existing `m_data` memory buffer.
-    //
-    // Example: (RowMajor layout)
-    //   m_dims:             [11, 12, 13, 14]
-    //   desc.dimensions():  [1,   1,  3, 14]
-    //
-    // In this case we can construct a TensorBlock starting at
-    // `m_data + desc.offset()`, with a `desc.dimensions()` block sizes.
-
-    static const bool
-        is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
-
-    // Find out how many inner dimensions have a matching size.
-    int num_matching_inner_dims = 0;
-    for (int i = 0; i < NumCoords; ++i) {
-      int dim = is_col_major ? i : NumCoords - i - 1;
-      if (m_dims[dim] != desc.dimensions()[dim]) break;
-      ++num_matching_inner_dims;
-    }
-
-    // All the outer dimensions must be of size `1`, except a single dimension
-    // before the matching inner dimension (`3` in the example above).
-    bool can_use_direct_access = true;
-    for (int i = num_matching_inner_dims + 1; i < NumCoords; ++i) {
-      int dim = is_col_major ? i : NumCoords - i - 1;
-      if (desc.dimension(dim) != 1) {
-        can_use_direct_access = false;
-        break;
-      }
-    }
-
-    if (can_use_direct_access) {
-      EvaluatorPointerType block_start = m_data + desc.offset();
-      return TensorBlockV2(internal::TensorBlockKind::kView, block_start,
-                           desc.dimensions());
-
-    } else {
-      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
-      ScalarNoConst* block_buffer = static_cast<ScalarNoConst*>(mem);
-
-      TensorBlockIOSrc src(internal::strides<Layout>(m_dims), m_data,
-                           desc.offset());
-      TensorBlockIODst dst(desc.dimensions(),
-                           internal::strides<Layout>(desc.dimensions()),
-                           block_buffer);
-
-      TensorBlockIO::Copy(dst, src);
-
-      return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch,
-                           block_buffer, desc.dimensions());
-    }
+    return TensorBlockV2::materialize(m_data, m_dims, desc, scratch);
   }
 
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
@@ -923,15 +869,21 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   typedef typename XprType::Scalar Scalar;
 
   enum {
-    IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
-                    PacketType<Scalar, Device>::HasBlend,
-    BlockAccess = false,
-    BlockAccessV2 = false,
-    PreferBlockAccess = false,
-    Layout = TensorEvaluator<IfArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = TensorEvaluator<ThenArgType, Device>::IsAligned &
+                        TensorEvaluator<ElseArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ThenArgType, Device>::PacketAccess &
+                        TensorEvaluator<ElseArgType, Device>::PacketAccess &
+                        PacketType<Scalar, Device>::HasBlend,
+    BlockAccess       = false,
+    BlockAccessV2     = TensorEvaluator<IfArgType, Device>::BlockAccessV2 &&
+                        TensorEvaluator<ThenArgType, Device>::BlockAccessV2 &&
+                        TensorEvaluator<ElseArgType, Device>::BlockAccessV2,
+    PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<ThenArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<ElseArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<IfArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
@@ -953,8 +905,36 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   typedef StorageMemory<CoeffReturnType, Device> Storage;
   typedef typename Storage::Type EvaluatorPointerType;
 
+  static const int NumDims = internal::array_size<Dimensions>::value;
+
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlockV2
+      IfArgTensorBlock;
+  typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlockV2
+      ThenArgTensorBlock;
+  typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlockV2
+      ElseArgTensorBlock;
+
+  struct TensorSelectOpBlockFactory {  // Block factory plugged into TensorTernaryExprBlock by the TensorBlockV2 typedef below.
+    template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+    struct XprType {
+      typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type;  // Select op over the three block expressions.
+    };
+
+    template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+    typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr(
+        const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const {
+      return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr);  // Argument order: condition, then, else.
+    }
+  };
+
+  typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory,
+                                           IfArgTensorBlock, ThenArgTensorBlock,
+                                           ElseArgTensorBlock>
+      TensorBlockV2;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -1000,6 +980,24 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
         .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    m_condImpl.getResourceRequirements(resources);  // Collect from the condition, then and else evaluators in turn.
+    m_thenImpl.getResourceRequirements(resources);
+    m_elseImpl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+    // It's unsafe to pass destination buffer to underlying expressions, because
+    // output might be aliased with one of the inputs.
+    desc.DropDestinationBuffer();
+
+    return TensorBlockV2(
+        m_condImpl.blockV2(desc, scratch), m_thenImpl.blockV2(desc, scratch),  // All three argument blocks are requested for the same `desc`.
+        m_elseImpl.blockV2(desc, scratch), TensorSelectOpBlockFactory());
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
 
 #ifdef EIGEN_USE_SYCL
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index be8f3a734..2a3398d67 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -324,6 +324,17 @@ struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
   }
 };
 
+template <typename FirstType, typename... OtherTypes>
+std::ostream& operator<<(std::ostream& os,
+                         const IndexList<FirstType, OtherTypes...>& dims) {
+  os << "[";
+  for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) {
+    if (i > 0) os << ", ";
+    os << dims[i];
+  }
+  os << "]";
+  return os;
+}
 
 template<typename FirstType, typename... OtherTypes>
 constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index c8333e488..5d4b0f061 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -113,6 +113,25 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
   static const int NumOutputDims = internal::array_size<Dimensions>::value;
   static const int NumInputDims  = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
 
+  enum ReshapingKind {
+    // We do not use layout information to determine reshaping kind.
+    // Depending on the layout `N` can be inner or outer dimension.
+    OneByN = 0,  // expr.reshape(1, N)
+    NByOne = 1,  // expr.reshape(N, 1)
+    Runtime = 2  // Reshape dimensions are dynamic (specified at runtime).
+  };
+
+  // clang-format off
+  static const ReshapingKind kind =
+#if defined(EIGEN_HAS_INDEX_LIST)
+        (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/0, /*value=*/1)) ? OneByN
+      : (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/1, /*value=*/1)) ? NByOne
+      : Runtime;
+#else
+        Runtime;
+#endif
+  // clang-format on
+
   enum {
     IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
@@ -121,8 +140,12 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
     BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess &&
                         TensorEvaluator<ArgType, Device>::RawAccess &&
                         NumInputDims > 0 && NumOutputDims > 0,
-    BlockAccessV2     = false,
-    PreferBlockAccess = true,
+    // For trivial reshapes with raw access to underlying data we will provide
+    // zero overhead block access.
+    // TODO(ezhulenev): Consider adding block access without raw access?
+    BlockAccessV2     = TensorEvaluator<ArgType, Device>::RawAccess &&
+                        NumInputDims > 0 && NumOutputDims > 0,
+    PreferBlockAccess = false,
     Layout            = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess       = false,  // to be implemented
     RawAccess         = TensorEvaluator<ArgType, Device>::RawAccess
@@ -139,7 +162,13 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
       OutputTensorBlockReader;
 
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef
+      typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims,
+                                                 Layout, Index>
+          TensorBlockV2;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -199,8 +228,9 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
-      std::vector<internal::TensorOpResourceRequirements>* resources) const {
-    m_impl.getResourceRequirements(resources);
+      std::vector<internal::TensorOpResourceRequirements>*) const {
+    // TODO(ezhulenev): If we'll ever support block evaluation without raw
+    // access we'll need to get requirements from `m_impl`.
   }
 
   // required in block(OutputTensorBlock* output_block) const
@@ -334,6 +364,26 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
     }
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+    eigen_assert(m_impl.data() != NULL);
+    eigen_assert((kind == Runtime) ||
+                 (kind == OneByN && desc.dimensions()[0] == 1) ||
+                 (kind == NByOne && desc.dimensions()[1] == 1));
+
+    if (kind == OneByN || kind == NByOne) {
+      // We can guarantee at compile time that block is just a contiguous slice
+      // of the underlying expression memory buffer.
+      return TensorBlockV2(internal::TensorBlockKind::kView,
+                           m_impl.data() + desc.offset(), desc.dimensions());
+    } else {
+      // This will do additional runtime checks, and in the end it might be also
+      // a view, or it might be a block materialized in the temporary buffer.
+      return TensorBlockV2::materialize(m_impl.data(), m_dimensions, desc,
+                                        scratch);
+    }
+  }
+
   EIGEN_DEVICE_FUNC typename Storage::Type data() const {
     return constCast(m_impl.data());
   }
@@ -365,14 +415,14 @@ template<typename NewDimensions, typename ArgType, typename Device>
   typedef NewDimensions Dimensions;
 
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    BlockAccessV2 = false,
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    BlockAccessV2     = TensorEvaluator<ArgType, Device>::RawAccess,
     PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -385,18 +435,37 @@ template<typename NewDimensions, typename ArgType, typename Device>
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<TensorEvaluator::NumOutputDims, Index>
+      TensorBlockDesc;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
     return this->m_impl.coeffRef(index);
   }
+
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
     this->m_impl.template writePacket<StoreMode>(index, x);
   }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    eigen_assert(this->m_impl.data() != NULL);
+    // ^ `eigen_assert` (not `assert`), matching the sibling `blockV2` methods.
+    typedef typename TensorBlock::XprType TensorBlockExpr;
+    typedef internal::TensorBlockAssignment<
+        Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index>
+        TensorBlockAssign;
+    // Assign `block.expr()` into the raw buffer at the block's offset.
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(desc.dimensions(),
+                                  internal::strides<Layout>(this->dimensions()),
+                                  this->m_impl.data(), desc.offset()),
+        block.expr());
+  }
 };
 
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 7b9ad7374..be2449ebd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -96,22 +96,29 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    BlockAccessV2 = false,
-    PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = true,
-    RawAccess = false
+    IsAligned         = true,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    BlockAccessV2     = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = true,
+    RawAccess         = false
   };
 
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlockV2;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
+      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
   {
     // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
     // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
@@ -212,6 +219,214 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     return cost;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::kSkewedInnerDims, block_total_size_max));
+
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+    eigen_assert(m_impl.data() != NULL);
+
+    // Check if we can reuse `desc` destination, or allocate new scratch buffer.
+    ScalarNoConst* materialized_output =
+        desc.template destination<ScalarNoConst, Layout>();
+
+    bool materialized_in_output;
+    if (materialized_output != NULL) {
+      desc.DropDestinationBuffer();
+      materialized_in_output = true;
+
+    } else {
+      const size_t materialized_output_size = desc.size() * sizeof(Scalar);
+      void* output_scratch_mem = scratch.allocate(materialized_output_size);
+      materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem);
+      materialized_in_output = false;
+    }
+
+    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
+
+    Index offset = desc.offset();
+
+    // Compute offsets in the output tensor corresponding to the desc.offset().
+    DSizes<Index, NumDims> output_offsets;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      const int stride_dim = IsColMajor ? dim : dim + 1;
+      output_offsets[dim] = offset / m_outputStrides[stride_dim];
+      offset -= output_offsets[dim] * m_outputStrides[stride_dim];
+    }
+    output_offsets[IsColMajor ? 0 : NumDims - 1] = offset;
+
+    // Offsets in the input corresponding to output offsets.
+    DSizes<Index, NumDims> input_offsets = output_offsets;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
+    }
+
+    // Compute offset in the input buffer (at this point it might be illegal and
+    // point outside of the input buffer, because we don't check for negative
+    // offsets, it will be autocorrected in the block iteration loop below).
+    Index input_offset = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offset += input_offsets[dim] * m_inputStrides[dim];
+    }
+
+    // Destination buffer and scratch buffer both indexed from 0 and have the
+    // same dimensions as the requested block (for destination buffer this
+    // property is guaranteed by `desc.destination()`).
+    Index output_offset = 0;
+    const DSizes<Index, NumDims> output_strides =
+        internal::strides<Layout>(desc.dimensions());
+
+    // NOTE(ezhulenev): We initialize block iteration state for `NumDims - 1`
+    // dimensions, skipping innermost dimension. In theory it should be possible
+    // to squeeze matching innermost dimensions, however in practice that did
+    // not show any improvements in benchmarks. Also in practice first outer
+    // dimension usually has padding, and will prevent squeezing.
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims - 1> it;
+    for (Index i = 0; i < NumDims - 1; ++i) {
+      const Index dim = IsColMajor ? i + 1 : NumDims - i - 2;
+      it[i].count = 0;
+      it[i].size = desc.dimension(dim);
+
+      it[i].input_stride = m_inputStrides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      it[i].output_stride = output_strides[dim];
+      it[i].output_span = it[i].output_stride * (it[i].size - 1);
+    }
+
+    const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
+
+    // Total output size.
+    const Index output_size = desc.size();
+
+    // We will fill inner dimension of this size in the output. It might be
+    // larger than the inner dimension in the input, so we might have to pad
+    // before/after we copy values from the input inner dimension.
+    const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
+
+    // How many values to fill with padding BEFORE reading from the input inner
+    // dimension.
+    const Index output_inner_pad_before_size =
+        input_offsets[inner_dim_idx] < 0
+            ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
+                           output_inner_dim_size)
+            : 0;
+
+    // How many values we can actually copy from the input inner dimension.
+    const Index output_inner_copy_size = numext::maxi(Index(0), numext::mini(
+        // Want to copy from input.
+        (output_inner_dim_size - output_inner_pad_before_size),
+        // Can copy from input (negative when block starts past its end).
+        (static_cast<Index>(m_impl.dimensions()[inner_dim_idx]) -
+         numext::maxi(input_offsets[inner_dim_idx], Index(0)))));
+
+    // How many values to fill with padding AFTER reading from the input inner
+    // dimension.
+    const Index output_inner_pad_after_size =
+        (output_inner_dim_size - output_inner_copy_size -
+         output_inner_pad_before_size);
+
+    // Sanity check, sum of all sizes must be equal to the output size.
+    eigen_assert(output_inner_dim_size ==
+                 (output_inner_pad_before_size + output_inner_copy_size +
+                  output_inner_pad_after_size));
+
+    // Keep track of current coordinates and padding in the output.
+    DSizes<Index, NumDims> output_coord = output_offsets;
+    DSizes<Index, NumDims> output_padded;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+    }
+
+    typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
+
+    // Iterate copying data from `m_impl.data()` to the output buffer.
+    for (Index size = 0; size < output_size; size += output_inner_dim_size) {
+      // Detect if we are in the padded region (exclude innermost dimension).
+      bool is_padded = false;
+      for (int j = 1; j < NumDims; ++j) {
+        const int dim = IsColMajor ? j : NumDims - j - 1;
+        is_padded = output_padded[dim];
+        if (is_padded) break;
+      }
+
+      if (is_padded) {
+        // Fill with padding value.
+        LinCopy::template Run<LinCopy::Kind::FillLinear>(
+            typename LinCopy::Dst(output_offset, 1, materialized_output),
+            typename LinCopy::Src(0, 0, &m_paddingValue),
+            output_inner_dim_size);
+
+      } else {
+        {  // Fill with padding before copying from input inner dimension.
+          const Index out = output_offset;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(
+              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Src(0, 0, &m_paddingValue),
+              output_inner_pad_before_size);
+        }
+
+        {  // Copy data from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size;
+          const Index in = input_offset + output_inner_pad_before_size;
+
+          LinCopy::template Run<LinCopy::Kind::Linear>(
+              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Src(in, 1, m_impl.data()),
+              output_inner_copy_size);
+        }
+
+        {  // Fill with padding after copying from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size +
+                            output_inner_copy_size;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(
+              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Src(0, 0, &m_paddingValue),
+              output_inner_pad_after_size);
+        }
+      }
+
+      for (int j = 0; j < NumDims - 1; ++j) {
+        const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
+
+        if (++it[j].count < it[j].size) {
+          input_offset += it[j].input_stride;
+          output_offset += it[j].output_stride;
+          output_coord[dim] += 1;
+          output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+          break;
+        }
+        it[j].count = 0;
+        input_offset -= it[j].input_span;
+        output_offset -= it[j].output_span;
+        output_coord[dim] -= it[j].size - 1;
+        output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+      }
+    }
+
+    return TensorBlockV2(materialized_in_output
+                         ? internal::TensorBlockKind::kMaterializedInOutput
+                         : internal::TensorBlockKind::kMaterializedInScratch,
+                         materialized_output,
+                         desc.dimensions());
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
 
 #ifdef EIGEN_USE_SYCL
@@ -222,6 +437,23 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 #endif
 
  private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : count(0),
+          size(0),
+          input_stride(0),
+          input_span(0),
+          output_stride(0),
+          output_span(0) {}
+
+    Index count;
+    Index size;
+    Index input_stride;
+    Index input_span;
+    Index output_stride;
+    Index output_span;
+  };
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
       Index index, int dim_index) const {
 #if defined(EIGEN_HAS_INDEX_LIST)
@@ -410,6 +642,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   PaddingDimensions m_padding;
 
   Scalar m_paddingValue;
+
+  const Device EIGEN_DEVICE_REF m_device;
 };
author	Eugene Zhulenev <ezhulenev@google.com>	2019-10-02 12:44:06 -0700
committer	Eugene Zhulenev <ezhulenev@google.com>	2019-10-02 12:44:06 -0700
commit	60ae24ee1a6c16114de456d77fcfba6f5a1160ca (patch)
tree	7b9d5463018055571a5050ca31a8d3df12a3e6fc /unsupported/Eigen/CXX11/src
parent	6e40454a6e6cc57c07c7340148657c985ca6c928 (diff)