From ae07801dd8d295657f28b006e1e4999edf835052 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Wed, 18 Dec 2019 20:07:00 +0000
Subject: Tensor block evaluation cost model

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h   |  95 ++++++++++++----
 .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h    |   8 +-
 .../Eigen/CXX11/src/Tensor/TensorChipping.h        |   6 +-
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h       |  23 ++--
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h        |   7 +-
 .../Eigen/CXX11/src/Tensor/TensorGenerator.h       |   8 +-
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h        |   5 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h |   5 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h |  10 +-
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h       |  21 ++--
 unsupported/test/cxx11_tensor_block_access.cpp     | 119 ++++++++++-----------
 unsupported/test/cxx11_tensor_block_eval.cpp       |   3 +-
 unsupported/test/cxx11_tensor_block_io.cpp         |  10 +-
 13 files changed, 194 insertions(+), 126 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index dc9af3aa8..e89f40213 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -73,14 +73,68 @@ EIGEN_STRONG_INLINE DSizes<IndexType, NumDims> strides(
 enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
 
 struct TensorBlockResourceRequirements {
-  TensorBlockShapeType shape_type;
-  size_t size;
+  TensorBlockShapeType shape_type;  // target block shape
+  size_t size;                      // target block size
+  TensorOpCost cost_per_coeff;      // cost of computing a single block element
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
+      TensorBlockShapeType shape_type, size_t size_in_bytes,
+      TensorOpCost cost) {
+    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
+    return {shape_type, size, cost};
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
+      TensorBlockShapeType shape_type, size_t size_in_bytes) {
+    // This default cost per coefficient is valid for most materialized tensor
+    // block evaluation implementations, because they typically just read
+    // coefficients from the underlying tensor storage and write to the tensor
+    // block buffer (scratch or destination memory; reads and writes have a
+    // linear access pattern). We ignore the fixed cost of block evaluation,
+    // because in practice it should be negligible.
+    //
+    // Lazy block evaluation adds the cost of calling a functor for each
+    // coefficient.
+    //
+    // All non-trivial block evaluation implementations must provide their own
+    // cost approximation (e.g. shuffling the inner dimension has a much higher
+    // cost because it reads memory randomly, although the total number of
+    // moved bytes is the same).
+    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
+                                    {/*bytes_loaded=*/sizeof(Scalar),
+                                     /*bytes_stored=*/sizeof(Scalar),
+                                     /*compute_cycles=*/0});
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
+      size_t size_in_bytes) {
+    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
+                                    size_in_bytes);
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
+      size_t size_in_bytes) {
+    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
+                                    size_in_bytes);
+  }
 
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
-  merge(const TensorBlockResourceRequirements &lhs,
-        const TensorBlockResourceRequirements &rhs) {
-    return {merge(lhs.shape_type, rhs.shape_type), merge(rhs.size, lhs.size)};
+  merge(const TensorBlockResourceRequirements& lhs,
+        const TensorBlockResourceRequirements& rhs) {
+    return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
+            merge(lhs.size, rhs.size),                       // size
+            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff
+  }
+
+  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
+      TensorOpCost cost) {
+    cost_per_coeff += cost;
+    return *this;
   }
 
   // This is a resource requirement that should be returned from expressions
@@ -88,10 +142,10 @@ struct TensorBlockResourceRequirements {
   // expression with raw buffer access).
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
   any() {
-    return {TensorBlockShapeType::kUniformAllDims, 1};
+    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
   }
 
-private:
+ private:
   using Requirements = TensorBlockResourceRequirements;
 
   EIGEN_DEVICE_FUNC
@@ -100,13 +154,19 @@ private:
   }
 
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs,
-                                                        TensorBlockShapeType rhs) {
+  static EIGEN_STRONG_INLINE TensorBlockShapeType
+  merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
     return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
             rhs == TensorBlockShapeType::kSkewedInnerDims)
                ? TensorBlockShapeType::kSkewedInnerDims
               : TensorBlockShapeType::kUniformAllDims;
   }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
+                                                TensorOpCost rhs_cost) {
+    return lhs_cost + rhs_cost;
+  }
 };
 
 // -------------------------------------------------------------------------- //
@@ -131,8 +191,9 @@ class TensorBlockDescriptor {
   class DestinationBuffer {
    public:
     enum DestinationBufferKind : int {
-      // The above explicit specification of "int" as the enum basetype is needed
-      // to get around a HIPCC link error ("the field type is not amp-compatible")
+      // The above explicit specification of "int" as the enum basetype is
+      // needed to get around a HIPCC link error ("the field type is not
+      // amp-compatible")
       // which is issued for class members with the enum type.
       // TODO(rocm):
       // remove the "int" basetype once HIPCC has been fixed to not error out
@@ -280,7 +341,7 @@ class TensorBlockMapper {
 
   TensorBlockMapper() = default;
   TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
-                   const TensorBlockResourceRequirements& requirements)
+                    const TensorBlockResourceRequirements& requirements)
       : m_tensor_dimensions(dimensions), m_requirements(requirements) {
     // Compute block dimensions and the total number of blocks.
     InitializeBlockDimensions();
@@ -299,8 +360,8 @@ class TensorBlockMapper {
     return m_block_dimensions;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  BlockDescriptor blockDescriptor(IndexType block_index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
+  blockDescriptor(IndexType block_index) const {
     static const bool isColMajor = Layout == static_cast<int>(ColMajor);
 
     IndexType offset = 0;
@@ -416,7 +477,7 @@ class TensorBlockMapper {
 
     eigen_assert(m_block_dimensions.TotalSize() >=
                  numext::mini<IndexType>(target_block_size,
-                                        m_tensor_dimensions.TotalSize()));
+                                         m_tensor_dimensions.TotalSize()));
 
     // Calculate block counts by dimension and total block count.
     DSizes<IndexType, NumDims> block_count;
@@ -761,7 +822,6 @@ class TensorMaterializedBlock {
 
 template <typename UnaryOp, typename ArgTensorBlock>
 class TensorCwiseUnaryBlock {
-
   static const bool NoArgBlockAccess =
       internal::is_void<typename ArgTensorBlock::XprType>::value;
 
@@ -793,7 +853,6 @@ class TensorCwiseUnaryBlock {
 
 template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
 class TensorCwiseBinaryBlock {
-
   static const bool NoArgBlockAccess =
       internal::is_void<typename LhsTensorBlock::XprType>::value ||
       internal::is_void<typename RhsTensorBlock::XprType>::value;
 
@@ -840,7 +899,6 @@ class TensorCwiseBinaryBlock {
 
 template <typename BlockFactory, typename ArgTensorBlock>
 class TensorUnaryExprBlock {
-
   typedef typename ArgTensorBlock::XprType ArgXprType;
   static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
 
@@ -872,7 +930,6 @@ class TensorUnaryExprBlock {
 
 template <typename BlockFactory, typename Arg1TensorBlock,
           typename Arg2TensorBlock, typename Arg3TensorBlock>
 class TensorTernaryExprBlock {
-
   typedef typename Arg1TensorBlock::XprType Arg1XprType;
   typedef typename Arg2TensorBlock::XprType Arg2XprType;
   typedef typename Arg3TensorBlock::XprType Arg3XprType;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 620c8741c..3408f90d1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -620,12 +620,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   internal::TensorBlockResourceRequirements getResourceRequirements() const {
     // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
     // tensors. But this might need further tuning.
-    const size_t target_block_size = numext::maxi<size_t>(
-        1, m_device.firstLevelCacheSize() / sizeof(Scalar));
-
+    const size_t target_size = m_device.firstLevelCacheSize();
     return internal::TensorBlockResourceRequirements::merge(
-        {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size},
-        m_impl.getResourceRequirements());
+        m_impl.getResourceRequirements(),
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index f51a8559d..5b28e706d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -296,11 +296,9 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    const size_t target_block_size =
-        numext::maxi<size_t>(1, m_device.lastLevelCacheSize() / sizeof(Scalar));
-
+    const size_t target_size = m_device.lastLevelCacheSize();
     return internal::TensorBlockResourceRequirements::merge(
-        {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size},
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
         m_impl.getResourceRequirements());
   }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 146cc325e..d4532b72c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -521,7 +521,9 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    return m_argImpl.getResourceRequirements();
+    static const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
+    return m_argImpl.getResourceRequirements().addCostPerCoeff(
+        {0, 0, functor_cost / PacketSize});
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
@@ -654,9 +656,11 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightAr
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
     return internal::TensorBlockResourceRequirements::merge(
-        m_leftImpl.getResourceRequirements(),
-        m_rightImpl.getResourceRequirements());
+               m_leftImpl.getResourceRequirements(),
+               m_rightImpl.getResourceRequirements())
+        .addCostPerCoeff({0, 0, functor_cost / PacketSize});
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
@@ -934,11 +938,16 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    auto then_req = m_thenImpl.getResourceRequirements();
+    auto else_req = m_elseImpl.getResourceRequirements();
+
+    auto merged_req =
+        internal::TensorBlockResourceRequirements::merge(then_req, else_req);
+    merged_req.cost_per_coeff =
+        then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff);
+
     return internal::TensorBlockResourceRequirements::merge(
-        m_condImpl.getResourceRequirements(),
-        internal::TensorBlockResourceRequirements::merge(
-            m_thenImpl.getResourceRequirements(),
-            m_elseImpl.getResourceRequirements()));
+        m_condImpl.getResourceRequirements(), merged_req);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index b90791d8d..93bab11b1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -245,8 +245,8 @@ TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
       evaluator.getResourceRequirements();
 
   // Update target block size based on cost model.
-  TensorOpCost cost = evaluator.costPerCoeff(Vectorizable);
-  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost);
+  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
+      1, requirements.cost_per_coeff);
   requirements.size = static_cast<size_t>(1.0 / taskSize);
 
   TensorBlockMapper block_mapper(
@@ -259,7 +259,8 @@ TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
       align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
 
-  return {block_mapper, cost * block_size, aligned_blocksize};
+  return {block_mapper, requirements.cost_per_coeff * block_size,
+          aligned_blocksize};
 }
 
 template
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index fb4b5e246..b1ff1d8b1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -166,10 +166,10 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    const size_t target_block_size = numext::maxi<size_t>(
-        1, m_device.firstLevelCacheSize() / sizeof(Scalar));
-    return {internal::TensorBlockShapeType::kSkewedInnerDims,
-            target_block_size};
+    const size_t target_size = m_device.firstLevelCacheSize();
+    // TODO(ezhulenev): Generator should have a cost.
+    return internal::TensorBlockResourceRequirements::skewed<Scalar>(
+        target_size);
   }
 
   struct BlockIteratorState {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 5c2036626..879a67ea4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -634,10 +634,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    const size_t target_block_size =
-        numext::maxi<size_t>(1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+    const size_t target_size = m_device.lastLevelCacheSize();
     return internal::TensorBlockResourceRequirements::merge(
-        {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size},
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
         m_impl.getResourceRequirements());
   }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 201bea6bb..e070d0b93 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -229,10 +229,9 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    const size_t target_block_size =
-        numext::maxi<size_t>(1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+    const size_t target_size = m_device.lastLevelCacheSize();
     return internal::TensorBlockResourceRequirements::merge(
-        {internal::TensorBlockShapeType::kSkewedInnerDims, target_block_size},
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
         m_impl.getResourceRequirements());
   }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index c4ac81db8..2fc85c13c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -246,10 +246,12 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    const size_t target_block_size =
-        numext::maxi<size_t>(1, m_device.lastLevelCacheSize() / sizeof(Scalar));
-    return {internal::TensorBlockShapeType::kSkewedInnerDims,
-            target_block_size};
+    const size_t target_size = m_device.lastLevelCacheSize();
+    // Block evaluation reads the underlying memory in reverse order, and the
+    // default cost model does not properly catch this in bytes stored/loaded.
+    return internal::TensorBlockResourceRequirements::skewed<Scalar>(
+               target_size)
+        .addCostPerCoeff({0, 0, 24});
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 1a6891ffd..597ca64cd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -249,14 +249,21 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
     static const int inner_dim =
         Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
 
-    const size_t target_block_size = numext::maxi<size_t>(
-        1, m_device.firstLevelCacheSize() / sizeof(Scalar));
-
+    const size_t target_size = m_device.firstLevelCacheSize();
     const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;
-    return {inner_dim_shuffled
-                ? internal::TensorBlockShapeType::kUniformAllDims
-                : internal::TensorBlockShapeType::kSkewedInnerDims,
-            target_block_size};
+
+    // A shuffled inner dimension leads to random memory access, which is not
+    // captured by the default cost model's bytes loaded/stored. We add this
+    // cost explicitly. The number of cycles was picked based on benchmarks.
+    // TODO(ezhulenev): This number was picked based on very questionable
+    // benchmarks; add benchmarks that are representative of real workloads.
+    using BlockRequirements = internal::TensorBlockResourceRequirements;
+    if (inner_dim_shuffled) {
+      return BlockRequirements::uniform<Scalar>(target_size)
+          .addCostPerCoeff({0, 0, NumDims * 28});
+    } else {
+      return BlockRequirements::skewed<Scalar>(target_size);
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp
index 33dc2535a..5fb12e0e0 100644
--- a/unsupported/test/cxx11_tensor_block_access.cpp
+++ b/unsupported/test/cxx11_tensor_block_access.cpp
@@ -21,6 +21,7 @@ using Eigen::RowMajor;
 using Eigen::ColMajor;
 
 using Eigen::internal::TensorBlockShapeType;
+static TensorOpCost zeroCost() { return {0, 0, 0}; }
 
 template <typename T>
 static const T& choose(int layout, const T& col, const T& row) {
@@ -73,7 +74,7 @@ static void test_block_mapper_sanity()
 
   // Test uniform blocks.
   TensorBlockMapper uniform_block_mapper(
-      tensor_dims, {TensorBlockShapeType::kUniformAllDims, 100});
+      tensor_dims, {TensorBlockShapeType::kUniformAllDims, 100, zeroCost()});
 
   VERIFY_IS_EQUAL(uniform_block_mapper.blockCount(), 100);
   VERIFY_IS_EQUAL(uniform_block_mapper.blockTotalSize(), 100);
@@ -85,7 +86,7 @@ static void test_block_mapper_sanity()
 
   // Test skewed to inner dims blocks.
   TensorBlockMapper skewed_block_mapper(
-      tensor_dims, {TensorBlockShapeType::kSkewedInnerDims, 100});
+      tensor_dims, {TensorBlockShapeType::kSkewedInnerDims, 100, zeroCost()});
 
   VERIFY_IS_EQUAL(skewed_block_mapper.blockCount(), 100);
   VERIFY_IS_EQUAL(skewed_block_mapper.blockTotalSize(), 100);
@@ -130,7 +131,8 @@ static void test_block_mapper_maps_every_element() {
   std::set<Index> coeff_set;
 
   // Try different combinations of block types and sizes.
-  TensorBlockMapper block_mapper(dims, {RandomShape(), RandomTargetSize(dims)});
+  TensorBlockMapper block_mapper(
+      dims, {RandomShape(), RandomTargetSize(dims), zeroCost()});
 
   for (int i = 0; i < block_mapper.blockCount(); ++i) {
     auto block = block_mapper.blockDescriptor(i);
@@ -233,9 +235,8 @@ static void test_uniform_block_shape()
   // Test shape 'UniformAllDims' with uniform 'max_coeff count'.
   DSizes<Index, 5> dims(11, 5, 6, 17, 7);
   const Index max_coeff_count = 5 * 5 * 5 * 5 * 5;
-  TensorBlockMapper
-      block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
-                          max_coeff_count});
+  TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                        max_coeff_count, zeroCost()});
   TensorBlock block = block_mapper.blockDescriptor(0);
   for (int i = 0; i < 5; ++i) {
     VERIFY_IS_EQUAL(5, block.dimensions()[i]);
@@ -248,9 +249,8 @@ static void test_uniform_block_shape()
   if (Layout == ColMajor) {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 7 * 5 * 5 * 5 * 5;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[0]);
     for (int i = 1; i < 5; ++i) {
@@ -260,9 +260,8 @@ static void test_uniform_block_shape()
   } else {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 5 * 5 * 5 * 5 * 6;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(6, block.dimensions()[4]);
     for (int i = 3; i >= 0; --i) {
@@ -276,9 +275,8 @@ static void test_uniform_block_shape()
   if (Layout == ColMajor) {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 5 * 5 * 5 * 5;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
     for (int i = 1; i < 5; ++i) {
@@ -288,9 +286,8 @@ static void test_uniform_block_shape()
   } else {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 5 * 5 * 5 * 5 * 7;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
     for (int i = 3; i >= 0; --i) {
@@ -304,9 +301,8 @@ static void test_uniform_block_shape()
   if (Layout == ColMajor) {
     DSizes<Index, 5> dims(7, 5, 6, 17, 7);
     const Index max_coeff_count = 7 * 5 * 6 * 7 * 5;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[0]);
     VERIFY_IS_EQUAL(5, block.dimensions()[1]);
@@ -317,9 +313,8 @@ static void test_uniform_block_shape()
   } else {
     DSizes<Index, 5> dims(7, 5, 6, 9, 7);
     const Index max_coeff_count = 5 * 5 * 5 * 6 * 7;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
     VERIFY_IS_EQUAL(6, block.dimensions()[3]);
@@ -333,9 +328,8 @@ static void test_uniform_block_shape()
   if (Layout == ColMajor) {
     DSizes<Index, 5> dims(7, 5, 6, 17, 7);
     const Index max_coeff_count = 7 * 5 * 6 * 17 * 7;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[0]);
     VERIFY_IS_EQUAL(5, block.dimensions()[1]);
@@ -346,9 +340,8 @@ static void test_uniform_block_shape()
   } else {
     DSizes<Index, 5> dims(7, 5, 6, 9, 7);
     const Index max_coeff_count = 7 * 5 * 6 * 9 * 7;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
     VERIFY_IS_EQUAL(9, block.dimensions()[3]);
@@ -369,9 +362,9 @@ static void test_skewed_inner_dim_block_shape()
   if (Layout == ColMajor) {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 10 * 1 * 1 * 1 * 1;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(10, block.dimensions()[0]);
     for (int i = 1; i < 5; ++i) {
@@ -381,9 +374,9 @@ static void test_skewed_inner_dim_block_shape()
   } else {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 1 * 1 * 1 * 1 * 6;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(6, block.dimensions()[4]);
     for (int i = 3; i >= 0; --i) {
@@ -396,9 +389,9 @@ static void test_skewed_inner_dim_block_shape()
   if (Layout == ColMajor) {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 1 * 1 * 1 * 1;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
     for (int i = 1; i < 5; ++i) {
@@ -408,9 +401,9 @@ static void test_skewed_inner_dim_block_shape()
   } else {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 1 * 1 * 1 * 1 * 7;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
     for (int i = 3; i >= 0; --i) {
@@ -424,9 +417,9 @@ static void test_skewed_inner_dim_block_shape()
   if (Layout == ColMajor) {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 3 * 1 * 1 * 1;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
     VERIFY_IS_EQUAL(3, block.dimensions()[1]);
@@ -437,9 +430,9 @@ static void test_skewed_inner_dim_block_shape()
   } else {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 1 * 1 * 1 * 15 * 7;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
     VERIFY_IS_EQUAL(15, block.dimensions()[3]);
@@ -454,9 +447,9 @@ static void test_skewed_inner_dim_block_shape()
   if (Layout == ColMajor) {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 5 * 5 * 1 * 1;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
     VERIFY_IS_EQUAL(5, block.dimensions()[1]);
@@ -468,9 +461,9 @@ static void test_skewed_inner_dim_block_shape()
   } else {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 1 * 1 * 5 * 17 * 7;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
     VERIFY_IS_EQUAL(17, block.dimensions()[3]);
@@ -485,9 +478,9 @@ static void test_skewed_inner_dim_block_shape()
   if (Layout == ColMajor) {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(11, block.dimensions()[0]);
     VERIFY_IS_EQUAL(5, block.dimensions()[1]);
@@ -498,9 +491,9 @@ static void test_skewed_inner_dim_block_shape()
   } else {
     DSizes<Index, 5> dims(11, 5, 6, 17, 7);
     const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
-    TensorBlockMapper
-        block_mapper(dims, {TensorBlockShapeType::kSkewedInnerDims,
-                            max_coeff_count});
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
     TensorBlock block = block_mapper.blockDescriptor(0);
     VERIFY_IS_EQUAL(7, block.dimensions()[4]);
     VERIFY_IS_EQUAL(17, block.dimensions()[3]);
@@ -524,7 +517,8 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
   DSizes<Index, 1> dims(0);
   for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
-    TensorBlockMapper block_mapper(dims, {block_shape, max_coeff_count});
+    TensorBlockMapper block_mapper(
+        dims, {block_shape, max_coeff_count, zeroCost()});
     VERIFY_IS_EQUAL(block_mapper.blockCount(), 0);
     VERIFY(block_mapper.blockTotalSize() >= 1);
   }
@@ -537,7 +531,8 @@ static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
     for (int dim2 = 0; dim2 < 3; ++dim2) {
       DSizes<Index, 2> dims(dim1, dim2);
       for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
-        TensorBlockMapper block_mapper(dims, {block_shape, max_coeff_count});
+        TensorBlockMapper block_mapper(
+            dims, {block_shape, max_coeff_count, zeroCost()});
         if (dim1 * dim2 == 0) {
           VERIFY_IS_EQUAL(block_mapper.blockCount(), 0);
         }
diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp
index 4a785dcdc..81f0c90da 100644
--- a/unsupported/test/cxx11_tensor_block_eval.cpp
+++ b/unsupported/test/cxx11_tensor_block_eval.cpp
@@ -64,7 +64,8 @@ static TensorBlockParams<NumDims> SkewedInnerBlock(
   using BlockMapper = internal::TensorBlockMapper<NumDims, Layout, Index>;
   BlockMapper block_mapper(dims,
                            {internal::TensorBlockShapeType::kSkewedInnerDims,
-                            internal::random<size_t>(1, dims.TotalSize())});
+                            internal::random<size_t>(1, dims.TotalSize()),
+                            {0, 0, 0}});
 
   Index total_blocks = block_mapper.blockCount();
   Index block_index = internal::random<Index>(0, total_blocks - 1);
diff --git a/unsupported/test/cxx11_tensor_block_io.cpp b/unsupported/test/cxx11_tensor_block_io.cpp
index 25584433e..b8600eaea 100644
--- a/unsupported/test/cxx11_tensor_block_io.cpp
+++ b/unsupported/test/cxx11_tensor_block_io.cpp
@@ -75,8 +75,8 @@ static void test_block_io_copy_data_from_source_to_target() {
   // Construct a tensor block mapper.
   using TensorBlockMapper = internal::TensorBlockMapper<NumDims, Layout, Index>;
-  TensorBlockMapper block_mapper(dims, {RandomBlockShape(),
-                                        RandomTargetBlockSize(dims)});
+  TensorBlockMapper block_mapper(
+      dims, {RandomBlockShape(), RandomTargetBlockSize(dims), {0, 0, 0}});
 
   // We will copy data from input to output through this buffer.
   Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions());
@@ -146,8 +146,10 @@ static void test_block_io_copy_using_reordered_dimensions() {
   // NOTE: Tensor block mapper works with shuffled dimensions.
   using TensorBlockMapper = internal::TensorBlockMapper<NumDims, Layout, Index>;
-  TensorBlockMapper block_mapper(output_tensor_dims, {RandomBlockShape(),
-                                 RandomTargetBlockSize(output_tensor_dims)});
+  TensorBlockMapper block_mapper(output_tensor_dims,
+                                 {RandomBlockShape(),
+                                  RandomTargetBlockSize(output_tensor_dims),
+                                  {0, 0, 0}});
 
   // We will copy data from input to output through this buffer.
   Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions());
-- 
cgit v1.2.3
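
Taken together, the TensorBlock.h hunks define a small algebra on resource requirements: shape types merge toward kSkewedInnerDims, target sizes merge (the merge(size_t, size_t) body is elided by the hunk context above), and per-coefficient costs add. Note also how the TensorSelectOp hunk deliberately deviates from plain merge, overriding the summed then/else cost with their cwiseMax before merging with the condition's requirements. The following standalone sketch models that algebra outside of Eigen; all names are hypothetical stand-ins for the types in the patch, and the size merge is assumed to take the larger of the two targets.

    // Standalone sketch (not Eigen code): the requirements algebra from this
    // patch, assuming that merging sizes keeps the larger target.
    #include <algorithm>
    #include <cstddef>

    struct OpCost {  // stand-in for Eigen's TensorOpCost
      double bytes_loaded, bytes_stored, compute_cycles;
      OpCost operator+(const OpCost& o) const {
        return {bytes_loaded + o.bytes_loaded, bytes_stored + o.bytes_stored,
                compute_cycles + o.compute_cycles};
      }
    };

    enum class BlockShape { kUniformAllDims, kSkewedInnerDims };

    struct BlockRequirements {  // stand-in for TensorBlockResourceRequirements
      BlockShape shape;
      size_t size;  // target block size in coefficients
      OpCost cost;  // cost of computing a single block coefficient

      static BlockRequirements merge(const BlockRequirements& l,
                                     const BlockRequirements& r) {
        const bool skewed = l.shape == BlockShape::kSkewedInnerDims ||
                            r.shape == BlockShape::kSkewedInnerDims;
        return {skewed ? BlockShape::kSkewedInnerDims
                       : BlockShape::kUniformAllDims,
                std::max(l.size, r.size),  // assumed; elided in the hunk above
                l.cost + r.cost};          // both subtrees are paid per coeff
      }

      BlockRequirements& addCostPerCoeff(OpCost c) {
        cost = cost + c;
        return *this;
      }
    };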
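
The TensorExecutor.h hunk is where the accumulated cost feeds back into block sizing: requirements.size becomes 1.0 / taskSize(1, cost_per_coeff), i.e. the number of coefficients whose estimated cost adds up to roughly one scheduler task. The toy computation below illustrates only this inverse relationship; kTaskCycles and the cycle conversion are made-up stand-ins, not Eigen's actual TensorCostModel.

    // Toy illustration (not Eigen's TensorCostModel): cheaper coefficients
    // yield larger blocks, so each block stays roughly one task of work.
    #include <cstddef>

    double toyTaskSize(double num_coeffs, double cycles_per_coeff) {
      const double kTaskCycles = 100000;  // hypothetical task granularity
      return num_coeffs * cycles_per_coeff / kTaskCycles;
    }

    int main() {
      // Expensive expression, e.g. a deep chain of fused functors.
      size_t expensive = static_cast<size_t>(1.0 / toyTaskSize(1, 50));  // 2000
      // Cheap copy-like expression: 2 cycles/coeff -> 50000-coeff blocks.
      size_t cheap = static_cast<size_t>(1.0 / toyTaskSize(1, 2));
      return expensive < cheap ? 0 : 1;
    }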
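
Finally, the TensorReverse.h and TensorShuffling.h hunks show the escape hatch for evaluators whose memory traffic is not captured by bytes loaded/stored: they append extra compute cycles per coefficient. Continuing the sketch above (hypothetical names, float scalar assumed), a shuffle-like evaluator would report:

    // Continues the BlockRequirements sketch; constants mirror the patch.
    BlockRequirements shuffleRequirements(size_t target_bytes, int num_dims,
                                          bool inner_dim_shuffled) {
      const size_t coeffs = std::max<size_t>(1, target_bytes / sizeof(float));
      // Default per-coefficient cost: one scalar read and one scalar write.
      BlockRequirements req{inner_dim_shuffled ? BlockShape::kUniformAllDims
                                               : BlockShape::kSkewedInnerDims,
                            coeffs,
                            {sizeof(float), sizeof(float), 0}};
      if (inner_dim_shuffled) {
        // Random reads are invisible to bytes loaded/stored, so charge
        // benchmark-derived cycles per coefficient (28 per dimension).
        req.addCostPerCoeff({0, 0, num_dims * 28.0});
      }
      return req;
    }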