From ae07801dd8d295657f28b006e1e4999edf835052 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Wed, 18 Dec 2019 20:07:00 +0000
Subject: Tensor block evaluation cost model

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 95 +++++++++++++++++++-----
 1 file changed, 76 insertions(+), 19 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index dc9af3aa8..e89f40213 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -73,14 +73,68 @@ EIGEN_STRONG_INLINE DSizes strides(
 enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
 
 struct TensorBlockResourceRequirements {
-  TensorBlockShapeType shape_type;
-  size_t size;
+  TensorBlockShapeType shape_type;  // target block shape
+  size_t size;                      // target block size
+  TensorOpCost cost_per_coeff;      // cost of computing a single block element
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
+      TensorBlockShapeType shape_type, size_t size_in_bytes,
+      TensorOpCost cost) {
+    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
+    return {shape_type, size, cost};
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
+      TensorBlockShapeType shape_type, size_t size_in_bytes) {
+    // This default cost per coefficient is valid for most materialized tensor
+    // block evaluation implementations, because they typically just read
+    // coefficients from the underlying tensor storage, and write to the tensor
+    // block buffer (scratch or destination memory, reads and writes have linear
+    // access pattern). We ignore the fixed cost of block evaluation, because in
+    // practice it should be negligible.
+    //
+    // Lazy block evaluation adds the cost of calling a functor for each
+    // coefficient.
+    //
+    // All non-trivial block evaluation implementations must provide their own
+    // cost approximation (e.g. shuffling inner dimension has a much higher cost
+    // because it reads memory randomly, although the total number of moved
+    // bytes is the same).
+    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
+                                    {/*bytes_loaded=*/sizeof(Scalar),
+                                     /*bytes_stored=*/sizeof(Scalar),
+                                     /*compute_cycles=*/0});
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
+      size_t size_in_bytes) {
+    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
+                                    size_in_bytes);
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
+      size_t size_in_bytes) {
+    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
+                                    size_in_bytes);
+  }
 
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
-  merge(const TensorBlockResourceRequirements &lhs,
-        const TensorBlockResourceRequirements &rhs) {
-    return {merge(lhs.shape_type, rhs.shape_type), merge(rhs.size, lhs.size)};
+  merge(const TensorBlockResourceRequirements& lhs,
+        const TensorBlockResourceRequirements& rhs) {
+    return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
+            merge(lhs.size, rhs.size),                       // size
+            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff
+  }
+
+  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
+      TensorOpCost cost) {
+    cost_per_coeff += cost;
+    return *this;
   }
 
   // This is a resource requirement that should be returned from expressions
@@ -88,10 +142,10 @@ struct TensorBlockResourceRequirements {
   // expression with raw buffer access).
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
-    return {TensorBlockShapeType::kUniformAllDims, 1};
+    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
   }
 
-private:
+ private:
   using Requirements = TensorBlockResourceRequirements;
 
   EIGEN_DEVICE_FUNC
@@ -100,13 +154,19 @@ private:
   }
 
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs,
-                                                        TensorBlockShapeType rhs) {
+  static EIGEN_STRONG_INLINE TensorBlockShapeType
+  merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
     return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
             rhs == TensorBlockShapeType::kSkewedInnerDims)
                ? TensorBlockShapeType::kSkewedInnerDims
                : TensorBlockShapeType::kUniformAllDims;
   }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
+                                                TensorOpCost rhs_cost) {
+    return lhs_cost + rhs_cost;
+  }
 };
 
 // -------------------------------------------------------------------------- //
@@ -131,8 +191,9 @@ class TensorBlockDescriptor {
   class DestinationBuffer {
    public:
     enum DestinationBufferKind : int {
-      // The above explicit specification of "int" as the enum basetype is needed
-      // to get around a HIPCC link error ("the field type is not amp-compatible")
+      // The above explicit specification of "int" as the enum basetype is
+      // needed to get around a HIPCC link error ("the field type is not
+      // amp-compatible")
       // which is issued for class members with the enum type.
       // TODO(rocm):
       // remove the "int" basetype once HIPCC has been fixed to not error out
@@ -280,7 +341,7 @@ class TensorBlockMapper {
   TensorBlockMapper() = default;
 
   TensorBlockMapper(const DSizes& dimensions,
-                    const TensorBlockResourceRequirements& requirements)
+                    const TensorBlockResourceRequirements& requirements)
       : m_tensor_dimensions(dimensions), m_requirements(requirements) {
     // Compute block dimensions and the total number of blocks.
     InitializeBlockDimensions();
@@ -299,8 +360,8 @@ class TensorBlockMapper {
     return m_block_dimensions;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  BlockDescriptor blockDescriptor(IndexType block_index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
+  blockDescriptor(IndexType block_index) const {
     static const bool isColMajor = Layout == static_cast<int>(ColMajor);
 
     IndexType offset = 0;
@@ -416,7 +477,7 @@ class TensorBlockMapper {
     eigen_assert(m_block_dimensions.TotalSize() >=
                  numext::mini(target_block_size,
-                              m_tensor_dimensions.TotalSize()));
+                              m_tensor_dimensions.TotalSize()));
 
     // Calculate block counts by dimension and total block count.
     DSizes block_count;
@@ -761,7 +822,6 @@ class TensorMaterializedBlock {
 
 template 
 class TensorCwiseUnaryBlock {
-
   static const bool NoArgBlockAccess = internal::is_void::value;
@@ -793,7 +853,6 @@ class TensorCwiseUnaryBlock {
 
 template 
 class TensorCwiseBinaryBlock {
-
   static const bool NoArgBlockAccess = internal::is_void::value ||
                                        internal::is_void::value;
@@ -840,7 +899,6 @@ class TensorCwiseBinaryBlock {
 
 template 
 class TensorUnaryExprBlock {
-
   typedef typename ArgTensorBlock::XprType ArgXprType;
 
   static const bool NoArgBlockAccess = internal::is_void::value;
@@ -872,7 +930,6 @@ class TensorUnaryExprBlock {
 
 template 
 class TensorTernaryExprBlock {
-
   typedef typename Arg1TensorBlock::XprType Arg1XprType;
   typedef typename Arg2TensorBlock::XprType Arg2XprType;
   typedef typename Arg3TensorBlock::XprType Arg3XprType;
-- 
cgit v1.2.3
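
Usage sketch (not part of the commit): the snippet below shows how the cost-aware
requirements introduced in this patch might be combined by an expression evaluator.
It only uses names declared in the diff above (TensorBlockResourceRequirements with
its skewed/uniform factories, addCostPerCoeff, merge, and Eigen::TensorOpCost) and
assumes they live in Eigen::internal as in TensorBlock.h; the 48 KB block budget and
the 5-cycle functor cost are made-up illustrative values.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  using Eigen::TensorOpCost;
  using Eigen::internal::TensorBlockResourceRequirements;

  // Hypothetical scratch budget for one block (illustrative value).
  const size_t block_size_in_bytes = 48 * 1024;

  // Materialized operand: default cost of one coefficient load + store per
  // element, with blocks skewed towards the inner dimension.
  TensorBlockResourceRequirements lhs =
      TensorBlockResourceRequirements::skewed<float>(block_size_in_bytes);

  // Lazily evaluated operand: same default memory traffic plus an assumed
  // functor cost of 5 compute cycles per coefficient.
  TensorBlockResourceRequirements rhs =
      TensorBlockResourceRequirements::uniform<float>(block_size_in_bytes)
          .addCostPerCoeff(TensorOpCost(/*bytes_loaded=*/0,
                                        /*bytes_stored=*/0,
                                        /*compute_cycles=*/5));

  // A binary expression merges both operands: the skewed shape wins, the
  // sizes are merged, and the per-coefficient costs are added.
  TensorBlockResourceRequirements merged =
      TensorBlockResourceRequirements::merge(lhs, rhs);

  return merged.size > 0 ? 0 : 1;
}

How evaluators ultimately consume the merged cost_per_coeff is outside this diff;
the sketch only exercises the API added here.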