From 60ae24ee1a6c16114de456d77fcfba6f5a1160ca Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Wed, 2 Oct 2019 12:44:06 -0700
Subject: Add block evaluation to TensorReshaping/TensorCasting/TensorPadding/TensorSelect

---
 unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 254 ++++++++++++++++++++-
 1 file changed, 244 insertions(+), 10 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 7b9ad7374..be2449ebd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -96,22 +96,29 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    BlockAccessV2 = false,
-    PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = true,
-    RawAccess = false
+    IsAligned         = true,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    BlockAccessV2     = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = true,
+    RawAccess         = false
   };
 
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlockV2;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
+      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
   {
     // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
     // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
@@ -212,6 +219,214 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     return cost;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::kSkewedInnerDims, block_total_size_max));
+
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+    eigen_assert(m_impl.data() != NULL);
+
+    // Check if we can reuse `desc` destination, or allocate new scratch buffer.
+    ScalarNoConst* materialized_output =
+        desc.template destination<ScalarNoConst, Layout>();
+
+    bool materialized_in_output;
+    if (materialized_output != NULL) {
+      desc.DropDestinationBuffer();
+      materialized_in_output = true;
+
+    } else {
+      const size_t materialized_output_size = desc.size() * sizeof(Scalar);
+      void* output_scratch_mem = scratch.allocate(materialized_output_size);
+      materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem);
+      materialized_in_output = false;
+    }
+
+    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
+
+    Index offset = desc.offset();
+
+    // Compute offsets in the output tensor corresponding to the desc.offset().
+    DSizes<Index, NumDims> output_offsets;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      const int stride_dim = IsColMajor ? dim : dim + 1;
+      output_offsets[dim] = offset / m_outputStrides[stride_dim];
+      offset -= output_offsets[dim] * m_outputStrides[stride_dim];
+    }
+    output_offsets[IsColMajor ? 0 : NumDims - 1] = offset;
+
+    // Offsets in the input corresponding to output offsets.
+    DSizes<Index, NumDims> input_offsets = output_offsets;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
+    }
+
+    // Compute offset in the input buffer (at this point it might be illegal and
+    // point outside of the input buffer, because we don't check for negative
+    // offsets; it will be autocorrected in the block iteration loop below).
+    Index input_offset = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offset += input_offsets[dim] * m_inputStrides[dim];
+    }
+
+    // Destination buffer and scratch buffer both indexed from 0 and have the
+    // same dimensions as the requested block (for destination buffer this
+    // property is guaranteed by `desc.destination()`).
+    Index output_offset = 0;
+    const DSizes<Index, NumDims> output_strides =
+        internal::strides<Layout>(desc.dimensions());
+
+    // NOTE(ezhulenev): We initialize block iteration state for `NumDims - 1`
+    // dimensions, skipping innermost dimension. In theory it should be possible
+    // to squeeze matching innermost dimensions, however in practice that did
+    // not show any improvements in benchmarks. Also in practice first outer
+    // dimension usually has padding, and will prevent squeezing.
+
+    // Initialize output block iterator state. Dimensions in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims - 1> it;
+    for (Index i = 0; i < NumDims - 1; ++i) {
+      const Index dim = IsColMajor ? i + 1 : NumDims - i - 2;
+      it[i].count = 0;
+      it[i].size = desc.dimension(dim);
+
+      it[i].input_stride = m_inputStrides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      it[i].output_stride = output_strides[dim];
+      it[i].output_span = it[i].output_stride * (it[i].size - 1);
+    }
+
+    const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
+
+    // Total output size.
+    const Index output_size = desc.size();
+
+    // We will fill inner dimension of this size in the output. It might be
+    // larger than the inner dimension in the input, so we might have to pad
+    // before/after we copy values from the input inner dimension.
+    const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
+
+    // How many values to fill with padding BEFORE reading from the input inner
+    // dimension.
+    const Index output_inner_pad_before_size =
+        input_offsets[inner_dim_idx] < 0
+            ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
+                           output_inner_dim_size)
+            : 0;
+
+    // How many values we can actually copy from the input inner dimension.
+    const Index output_inner_copy_size = numext::mini(
+        // Want to copy from input.
+        (output_inner_dim_size - output_inner_pad_before_size),
+        // Can copy from input.
+        (static_cast<Index>(m_impl.dimensions()[inner_dim_idx]) -
+         numext::maxi(input_offsets[inner_dim_idx], Index(0))));
+
+    // How many values to fill with padding AFTER reading from the input inner
+    // dimension.
+    const Index output_inner_pad_after_size =
+        (output_inner_dim_size - output_inner_copy_size -
+         output_inner_pad_before_size);
+
+    // Sanity check: sum of all sizes must be equal to the output size.
+    eigen_assert(output_inner_dim_size ==
+                 (output_inner_pad_before_size + output_inner_copy_size +
+                  output_inner_pad_after_size));
+
+    // Keep track of current coordinates and padding in the output.
+    DSizes<Index, NumDims> output_coord = output_offsets;
+    DSizes<Index, NumDims> output_padded;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+    }
+
+    typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
+
+    // Iterate copying data from `m_impl.data()` to the output buffer.
+    for (Index size = 0; size < output_size; size += output_inner_dim_size) {
+      // Detect if we are in the padded region (exclude innermost dimension).
+      bool is_padded = false;
+      for (int j = 1; j < NumDims; ++j) {
+        const int dim = IsColMajor ? j : NumDims - j - 1;
+        is_padded = output_padded[dim];
+        if (is_padded) break;
+      }
+
+      if (is_padded) {
+        // Fill with padding value.
+        LinCopy::template Run<LinCopy::Kind::FillLinear>(
+            typename LinCopy::Dst(output_offset, 1, materialized_output),
+            typename LinCopy::Src(0, 0, &m_paddingValue),
+            output_inner_dim_size);
+
+      } else {
+        {  // Fill with padding before copying from input inner dimension.
+          const Index out = output_offset;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(
+              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Src(0, 0, &m_paddingValue),
+              output_inner_pad_before_size);
+        }
+
+        {  // Copy data from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size;
+          const Index in = input_offset + output_inner_pad_before_size;
+
+          LinCopy::template Run<LinCopy::Kind::Linear>(
+              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Src(in, 1, m_impl.data()),
+              output_inner_copy_size);
+        }
+
+        {  // Fill with padding after copying from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size +
+                            output_inner_copy_size;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(
+              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Src(0, 0, &m_paddingValue),
+              output_inner_pad_after_size);
+        }
+      }
+
+      for (int j = 0; j < NumDims - 1; ++j) {
+        const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
+
+        if (++it[j].count < it[j].size) {
+          input_offset += it[j].input_stride;
+          output_offset += it[j].output_stride;
+          output_coord[dim] += 1;
+          output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+          break;
+        }
+        it[j].count = 0;
+        input_offset -= it[j].input_span;
+        output_offset -= it[j].output_span;
+        output_coord[dim] -= it[j].size - 1;
+        output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+      }
+    }
+
+    return TensorBlockV2(materialized_in_output
+                             ? internal::TensorBlockKind::kMaterializedInOutput
+                             : internal::TensorBlockKind::kMaterializedInScratch,
+                         materialized_output,
+                         desc.dimensions());
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
 
 #ifdef EIGEN_USE_SYCL
@@ -222,6 +437,23 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 #endif
 
  private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : count(0),
+          size(0),
+          input_stride(0),
+          input_span(0),
+          output_stride(0),
+          output_span(0) {}
+
+    Index count;
+    Index size;
+    Index input_stride;
+    Index input_span;
+    Index output_stride;
+    Index output_span;
+  };
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
       Index index, int dim_index) const {
 #if defined(EIGEN_HAS_INDEX_LIST)
@@ -410,6 +642,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
   PaddingDimensions m_padding;
   Scalar m_paddingValue;
+
+  const Device EIGEN_DEVICE_REF m_device;
 };
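Note (not part of the patch): a minimal sketch of how the padding evaluator above is typically driven from user code. Assigning a padded expression through a ThreadPoolDevice is one way the block-based path added here (getResourceRequirements / blockV2) may be exercised internally, depending on the Eigen version and executor; the shapes, padding amounts, and the four-thread pool below are illustrative assumptions, not values taken from the commit.

// Sketch: evaluate a padded tensor expression. With EIGEN_USE_THREADS the
// assignment goes through the ThreadPoolDevice executor, which can dispatch
// to the evaluator's block interface when it is available.
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <utility>

int main() {
  Eigen::Tensor<float, 2> input(30, 50);
  input.setRandom();

  // (before, after) padding amounts for each dimension.
  Eigen::array<std::pair<int, int>, 2> paddings;
  paddings[0] = std::make_pair(1, 2);  // rows:    1 before, 2 after
  paddings[1] = std::make_pair(3, 4);  // columns: 3 before, 4 after

  // Output shape grows by the padding amounts: (30+1+2) x (50+3+4).
  Eigen::Tensor<float, 2> padded(33, 57);

  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);
  padded.device(device) = input.pad(paddings);  // pads with 0 by default

  return 0;
}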