From a411e9f344a354673b72a490433cf3cc2148ddf1 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Thu, 10 Oct 2019 10:56:58 -0700
Subject: Block evaluation for TensorGenerator + TensorReverse + fixed bug in
 tensor reverse op

---
 unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 196 ++++++++++++++++++---
 1 file changed, 169 insertions(+), 27 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 855d04eb7..6e7abeb09 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -116,7 +116,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = true,
-    BlockAccessV2 = false,
+    BlockAccessV2 = NumDims > 0,
     PreferBlockAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
@@ -130,7 +130,15 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
       OutputTensorBlock;
 
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<ArgType, Device>::TensorBlockV2
+      ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+                                                     Layout, Index>
+      TensorBlockV2;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
@@ -240,17 +248,6 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
         internal::kSkewedInnerDims, block_total_size_max));
   }
 
-  struct BlockIteratorState {
-    Index block_size;
-    Index block_stride;
-    Index block_span;
-    Index input_size;
-    Index input_stride;
-    Index input_span;
-    Index count;
-    bool reverse;
-  };
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
       OutputTensorBlock* output_block) const {
     if (NumDims <= 0) return;
@@ -278,15 +275,16 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
 
     array<BlockIteratorState, NumDims> it;
     for (Index i = 0; i < NumDims; ++i) {
       const Index dim = isColMajor ? i : NumDims - 1 - i;
-      it[i].block_size = output_block->block_sizes()[dim];
-      it[i].block_stride = output_block->block_strides()[dim];
-      it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
-      it[i].input_size = m_dimensions[dim];
-      it[i].input_stride = m_strides[dim];
-      it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+      it[i].size = output_block->block_sizes()[dim];
       it[i].count = 0;
       it[i].reverse = m_reverse[dim];
+
+      it[i].block_stride = output_block->block_strides()[dim];
+      it[i].block_span = it[i].block_stride * (it[i].size - 1);
+
+      it[i].input_stride = m_strides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
       if (it[i].reverse) {
         it[i].input_stride = -1 * it[i].input_stride;
         it[i].input_span = -1 * it[i].input_span;
@@ -298,17 +296,16 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
     int effective_inner_dim = 0;
     for (int i = 1; i < NumDims; ++i) {
       if (it[i].reverse != it[effective_inner_dim].reverse) break;
-      if (it[i].block_stride != it[effective_inner_dim].input_size) break;
+      if (it[i].block_stride != it[effective_inner_dim].size) break;
       if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
 
-      it[i].block_size = it[effective_inner_dim].block_size * it[i].block_size;
-      it[i].input_size = it[effective_inner_dim].input_size * it[i].input_size;
+      it[i].size = it[effective_inner_dim].size * it[i].size;
 
       it[i].block_stride = 1;
       it[i].input_stride = (inner_dim_reversed ? -1 : 1);
 
-      it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
-      it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+      it[i].block_span = it[i].block_stride * (it[i].size - 1);
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
 
       effective_inner_dim = i;
     }
@@ -317,9 +314,9 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
     eigen_assert(it[effective_inner_dim].input_stride ==
                  (inner_dim_reversed ? -1 : 1));
 
-    const Index inner_dim_size = it[effective_inner_dim].block_size;
+    const Index inner_dim_size = it[effective_inner_dim].size;
 
-    while (it[NumDims - 1].count < it[NumDims - 1].block_size) {
+    while (it[NumDims - 1].count < it[NumDims - 1].size) {
       // Copy inner-most dimension data from reversed location in input.
       Index dst = block_offset;
       Index src = input_offset;
@@ -345,7 +342,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
 
       // Update offset.
       for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
-        if (++it[i].count < it[i].block_size) {
+        if (++it[i].count < it[i].size) {
           block_offset += it[i].block_stride;
          input_offset += it[i].input_stride;
          break;
@@ -357,6 +354,131 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
     }
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+    // TODO(ezhulenev): If the underlying tensor expression supports and
+    // prefers block evaluation we must use it. Currently we use coeff and
+    // packet access into the underlying tensor expression.
+    // static const bool useBlockAccessForArgType =
+    //     TensorEvaluator<ArgType, Device>::BlockAccess &&
+    //     TensorEvaluator<ArgType, Device>::PreferBlockAccess;
+
+    static const bool isColMajor =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
+    const bool inner_dim_reversed = m_reverse[inner_dim_idx];
+
+    // Try to reuse destination as an output block buffer.
+    CoeffReturnType* block_buffer = desc.template destination<CoeffReturnType, Layout>();
+    bool materialized_in_output;
+
+    if (block_buffer != NULL) {
+      materialized_in_output = true;
+
+    } else {
+      materialized_in_output = false;
+      void* mem = scratch.allocate(desc.size() * sizeof(CoeffReturnType));
+      block_buffer = static_cast<CoeffReturnType*>(mem);
+    }
+
+    // Offset in the output block.
+    Index block_offset = 0;
+
+    // Offset in the input Tensor.
+    Index input_offset = reverseIndex(desc.offset());
+
+    // Initialize output block iterator state. Dimensions in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = isColMajor ? i : NumDims - 1 - i;
+      it[i].size = desc.dimension(dim);
+      it[i].count = 0;
+      it[i].reverse = m_reverse[dim];
+
+      it[i].block_stride =
+          i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride);
+      it[i].block_span = it[i].block_stride * (it[i].size - 1);
+
+      it[i].input_stride = m_strides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      if (it[i].reverse) {
+        it[i].input_stride = -1 * it[i].input_stride;
+        it[i].input_span = -1 * it[i].input_span;
+      }
+    }
+
+    // If multiple inner dimensions have the same reverse flag, check if we can
+    // merge them into a single virtual inner dimension.
+    int effective_inner_dim = 0;
+    for (int i = 1; i < NumDims; ++i) {
+      if (it[i].reverse != it[effective_inner_dim].reverse) break;
+      if (it[i].block_stride != it[effective_inner_dim].size) break;
+      if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
+
+      it[i].size = it[effective_inner_dim].size * it[i].size;
+
+      it[i].block_stride = 1;
+      it[i].input_stride = (inner_dim_reversed ? -1 : 1);
+
+      it[i].block_span = it[i].block_stride * (it[i].size - 1);
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      effective_inner_dim = i;
+    }
+
+    eigen_assert(it[effective_inner_dim].block_stride == 1);
+    eigen_assert(it[effective_inner_dim].input_stride ==
+                 (inner_dim_reversed ? -1 : 1));
+
+    const Index inner_dim_size = it[effective_inner_dim].size;
+
+    while (it[NumDims - 1].count < it[NumDims - 1].size) {
+      // Copy inner-most dimension data from reversed location in input.
+      Index dst = block_offset;
+      Index src = input_offset;
+
+      // NOTE(ezhulenev): Adding a vectorized path with internal::preverse
+      // showed worse results in benchmarks than a simple coefficient loop.
+      if (inner_dim_reversed) {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          block_buffer[dst] = m_impl.coeff(src);
+          ++dst;
+          --src;
+        }
+      } else {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          block_buffer[dst] = m_impl.coeff(src);
+          ++dst;
+          ++src;
+        }
+      }
+
+      // For a 1d tensor we need to generate only one inner-most dimension.
+      if ((NumDims - effective_inner_dim) == 1) break;
+
+      // Update offset.
+      for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].size) {
+          block_offset += it[i].block_stride;
+          input_offset += it[i].input_stride;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        block_offset -= it[i].block_span;
+        input_offset -= it[i].input_span;
+      }
+    }
+
+    return TensorBlockV2(
+        materialized_in_output
+            ? internal::TensorBlockKind::kMaterializedInOutput
+            : internal::TensorBlockKind::kMaterializedInScratch,
+        block_buffer, desc.dimensions());
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                      2 * TensorOpCost::MulCost<Index>() +
@@ -386,6 +508,26 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
   TensorEvaluator<ArgType, Device> m_impl;
   ReverseDimensions m_reverse;
   const Device EIGEN_DEVICE_REF m_device;
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : size(0),
+          count(0),
+          reverse(false),
+          block_stride(0),
+          block_span(0),
+          input_stride(0),
+          input_span(0) {}
+
+    Index size;
+    Index count;
+    bool reverse;
+    Index block_stride;
+    Index block_span;
+    Index input_stride;
+    Index input_span;
+  };
 };
 
 // Eval as lvalue
-- 
cgit v1.2.3
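
For reference, a minimal, self-contained sketch (not part of the patch) of the public expression this evaluator backs: Tensor::reverse takes a per-dimension array of bool flags, which is what the evaluator stores as m_reverse. Whether the assignment below is evaluated coefficient by coefficient or block by block (through blockV2 above) is an internal executor decision; the variable names are illustrative only.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // A small 3x4 tensor with random coefficients.
  Eigen::Tensor<float, 2> input(3, 4);
  input.setRandom();

  // Reverse only the first dimension; this per-dimension flag array plays the
  // role of the evaluator's m_reverse member.
  Eigen::array<bool, 2> reverse_dims = {{true, false}};

  // Evaluating the assignment exercises the TensorReverse evaluator; the
  // executor may materialize the result block by block when block access is
  // preferred.
  Eigen::Tensor<float, 2> output = input.reverse(reverse_dims);

  // After evaluation: output(i, j) == input(2 - i, j) for all valid (i, j).
  return 0;
}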