author    Eugene Zhulenev <ezhulenev@google.com>  2018-07-31 15:56:31 -0700
committer Eugene Zhulenev <ezhulenev@google.com>  2018-07-31 15:56:31 -0700
commit    83c0a16baf5ecac6288cd9b74536a82de8985b31 (patch)
tree      fc284a18d1a457eecfb2ce4aeda26bd0c78f2056 /unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
parent    d6568425f8b384ef36da1f16be76a3977fb90846 (diff)
Add block evaluation support to TensorOps
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h  437
1 file changed, 431 insertions, 6 deletions
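For context, "block evaluation" means computing the reduction over cache-sized tiles of the output, streaming the matching input slice through a small working set instead of recomputing input coordinates per output coefficient. The sketch below illustrates only the general idea, not the Eigen internals in the patch; the function name, block size parameter, and the row-major sum reduction are assumptions made for the example.

#include <algorithm>
#include <cstddef>
#include <vector>

// Illustrative only: sum-reduce the inner dimension of a row-major 2-D
// buffer, processing 'block_rows' output coefficients at a time so their
// accumulators stay cache resident while the input slice streams through.
std::vector<float> BlockedRowSums(const float* data, std::size_t rows,
                                  std::size_t cols, std::size_t block_rows) {
  std::vector<float> result(rows, 0.0f);
  for (std::size_t r0 = 0; r0 < rows; r0 += block_rows) {
    const std::size_t r1 = std::min(rows, r0 + block_rows);
    for (std::size_t r = r0; r < r1; ++r) {
      float acc = 0.0f;
      for (std::size_t c = 0; c < cols; ++c) acc += data[r * cols + c];
      result[r] = acc;
    }
  }
  return result;
}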
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 375fc0802..05c0990dc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -356,6 +356,70 @@ template <int NPT, typename S, typename R, typename I>
__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif
+template <typename Self, typename Op,
+ bool Vectorizable =
+ (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess)>
+class BlockReducer {
+ public:
+ typedef typename Self::Index Index;
+ typedef typename Self::Scalar Scalar;
+ typedef typename Self::CoeffReturnType CoeffReturnType;
+ typedef typename Self::PacketReturnType PacketReturnType;
+ explicit BlockReducer(const Op& reducer) : op_(reducer) {
+ accum_ = op_.initialize();
+ }
+ void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
+ for (Index i = 0; i < num_values_to_reduce; ++i) {
+ op_.reduce(data[index + i], &accum_);
+ }
+ }
+ CoeffReturnType Finalize() { return op_.finalize(accum_); }
+ PacketReturnType FinalizePacket() {
+ // TODO(andydavis) This function should not be called for Scalar
+ // reductions: clean this up or add an assert here.
+ return PacketReturnType();
+ }
+
+ private:
+ CoeffReturnType accum_;
+ Op op_;
+};
+
+template <typename Self, typename Op>
+class BlockReducer<Self, Op, true> {
+ public:
+ typedef typename Self::Index Index;
+ typedef typename Self::Scalar Scalar;
+ typedef typename Self::CoeffReturnType CoeffReturnType;
+ typedef typename Self::PacketReturnType PacketReturnType;
+ static const Index PacketSize =
+ internal::unpacket_traits<PacketReturnType>::size;
+
+ explicit BlockReducer(const Op& reducer) : op_(reducer) {
+ vaccum_ = op_.template initializePacket<PacketReturnType>();
+ accum_ = op_.initialize();
+ }
+ void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
+ const Index vectorized_size =
+ (num_values_to_reduce / PacketSize) * PacketSize;
+ for (Index i = 0; i < vectorized_size; i += PacketSize) {
+ op_.reducePacket(
+ internal::ploadt<PacketReturnType, Unaligned>(&data[index + i]),
+ &vaccum_);
+ }
+ for (Index i = vectorized_size; i < num_values_to_reduce; ++i) {
+ op_.reduce(data[index + i], &accum_);
+ }
+ }
+ CoeffReturnType Finalize() { return op_.finalizeBoth(accum_, vaccum_); }
+ PacketReturnType FinalizePacket() { return op_.finalizePacket(vaccum_); }
+
+ private:
+ PacketReturnType vaccum_;
+ CoeffReturnType accum_;
+ Op op_;
+};
+
} // end namespace internal
@@ -394,6 +458,7 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType,
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
{
+ typedef internal::reducer_traits<Op, Device> ReducerTraits;
typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
typedef typename XprType::Index Index;
typedef ArgType ChildType;
@@ -410,14 +475,19 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
- IsAligned = false,
+ IsAligned = false,
PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
- BlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
};
+ using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+ using OutputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>;
+ using InputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>;
+
static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool RunningFullReduction = (NumOutputDims==0);
@@ -451,11 +521,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
m_outputStrides[0] = 1;
for (int i = 1; i < NumOutputDims; ++i) {
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
} else {
- m_outputStrides.back() = 1;
+ m_outputStrides[NumOutputDims - 1] = 1;
for (int i = NumOutputDims - 2; i >= 0; --i) {
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
}
}
@@ -483,6 +555,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
++reduceIndex;
} else {
m_preservedStrides[outputIndex] = input_strides[i];
+ m_output_to_input_dim_map[outputIndex] = i;
++outputIndex;
}
}
@@ -492,6 +565,16 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
if (NumOutputDims == 0) {
m_preservedStrides[0] = internal::array_prod(input_dims);
}
+
+ m_numValuesToReduce =
+ NumOutputDims == 0
+ ? internal::array_prod(input_dims)
+ : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
+ ? m_preservedStrides[0]
+ : m_preservedStrides[NumOutputDims - 1];
+
+ m_block_total_size_max =
+ numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -686,6 +769,265 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ resources->push_back(internal::TensorOpResourceRequirements(
+ internal::TensorBlockShapeType::kSkewedInnerDims,
+ m_block_total_size_max));
+ m_impl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block(
+ OutputTensorBlock* output_block) const {
+ // Special case full reductions to avoid input block copy below.
+ if (NumInputDims == NumReducedDims) {
+ eigen_assert(output_block->first_coeff_index() == 0);
+ eigen_assert(output_block->block_sizes().TotalSize() == 1);
+ Op reducer(m_reducer);
+ output_block->data()[0] = internal::InnerMostDimReducer<Self, Op>::reduce(
+ *this, 0, m_numValuesToReduce, reducer);
+ return;
+ }
+
+ // Calculate input tensor 'slice' required to reduce output block coeffs.
+ DSizes<Index, NumInputDims> input_slice_sizes(m_impl.dimensions());
+ for (int i = 0; i < NumOutputDims; ++i) {
+ // Clip preserved input dimensions by output block size.
+ input_slice_sizes[m_output_to_input_dim_map[i]] =
+ output_block->block_sizes()[i];
+ }
+
+ // Shard input tensor slice into blocks (because it could be large if we
+ // need to reduce along several dimensions to calculate required output
+ // coefficients).
+ const Index max_coeff_count =
+ numext::mini<Index>(((m_device.firstLevelCacheSize()) / sizeof(Scalar)),
+ input_slice_sizes.TotalSize());
+
+ // Calculate max output shard size needed to keep working set of reducers
+ // in L1, while leaving enough space for reducer overhead and 'PacketSize'
+ // reductions.
+ DSizes<Index, NumInputDims> target_input_block_sizes;
+ CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes,
+ &target_input_block_sizes);
+ // Calculate indices for first preserved dimension.
+ const Index first_preserved_dim_output_index =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumOutputDims - 1;
+ const Index first_preserved_dim_input_index =
+ m_output_to_input_dim_map[first_preserved_dim_output_index];
+ const bool inner_most_dim_preserved =
+ first_preserved_dim_input_index ==
+ (static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumInputDims - 1) |
+ PreservingInnerMostDims;
+
+ // Calculate output block inner/outer dimension sizes.
+ const Index output_block_inner_dim_size =
+ output_block->block_sizes()[first_preserved_dim_output_index];
+ const Index output_block_outer_dim_size =
+ output_block->block_sizes().TotalSize() / output_block_inner_dim_size;
+ // Calculate shard size for first preserved dimension.
+ const Index output_shard_size =
+ target_input_block_sizes[first_preserved_dim_input_index];
+ const Index num_output_shards =
+ (output_block_inner_dim_size + output_shard_size - 1) /
+ output_shard_size;
+
+ // Initialize 'tensor_slice_offsets' from input coords of output index.
+ DSizes<Index, NumInputDims> tensor_slice_offsets;
+ GetInputCoordsForOutputIndex(output_block->first_coeff_index(),
+ &tensor_slice_offsets);
+
+ // Store tensor slice offset in first preserved dimension to be used
+ // to update tensor slice extents in loop below.
+ const Index first_preserved_dim_offset_start =
+ tensor_slice_offsets[first_preserved_dim_input_index];
+
+ array<BlockIteratorState, NumOutputDims> block_iter_state;
+
+ // Initialize state used to iterate through output coefficients
+ // and update 'tensor_slice_offsets' in outer preserved dims.
+ for (int i = 0; i < NumOutputDims - 1; ++i) {
+ const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? i + 1
+ : NumOutputDims - i - 2;
+ block_iter_state[i].input_dim = m_output_to_input_dim_map[dim];
+ block_iter_state[i].output_size = output_block->block_sizes()[dim];
+ block_iter_state[i].output_count = 0;
+ }
+
+ // Allocate input block memory.
+ ScalarNoConst* input_block_data = static_cast<ScalarNoConst*>(
+ m_device.allocate(max_coeff_count * sizeof(Scalar)));
+ // Allocate reducer memory.
+ const bool packet_reductions_enabled =
+ (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess);
+ const Index num_reducers =
+ (inner_most_dim_preserved && packet_reductions_enabled)
+ ? (output_shard_size / PacketSize + output_shard_size % PacketSize +
+ PacketSize)
+ : output_shard_size;
+ typedef internal::BlockReducer<Self, Op> BlockReducer;
+ BlockReducer* reducers = static_cast<BlockReducer*>(
+ m_device.allocate(num_reducers * sizeof(BlockReducer)));
+
+ InputDimensions input_tensor_dims(m_impl.dimensions());
+ for (Index output_outer_index = 0;
+ output_outer_index < output_block_outer_dim_size;
+ ++output_outer_index) {
+ for (Index output_shard_index = 0; output_shard_index < num_output_shards;
+ ++output_shard_index) {
+ // Initialize 'tensor_slice_extents' for this output shard.
+ DSizes<Index, NumInputDims> tensor_slice_extents(input_slice_sizes);
+ for (int i = 0; i < NumInputDims; ++i) {
+ if (i == first_preserved_dim_input_index) {
+ // Clip first preserved dim size to output shard size.
+ tensor_slice_extents[i] = numext::mini(
+ output_shard_size,
+ input_slice_sizes[i] - (tensor_slice_offsets[i] -
+ first_preserved_dim_offset_start));
+
+ } else if (!m_reduced[i]) {
+ // Clip outer preserved dims to size 1, so that we reduce a
+ // contiguous set of output coefficients.
+ tensor_slice_extents[i] = 1;
+ }
+ }
+
+      // Initialize output coefficient reducers.
+ for (int i = 0; i < num_reducers; ++i) {
+ new (&reducers[i]) BlockReducer(m_reducer);
+ }
+
+ using TensorSliceBlockMapper =
+ internal::TensorSliceBlockMapper<ScalarNoConst, Index, NumInputDims,
+ Layout>;
+
+ // TODO(andydavis) Consider removing 'input_block_stride_order' if we
+ // find that scattered reads are not worth supporting in
+ // TensorSliceBlockMapper.
+ TensorSliceBlockMapper block_mapper(
+ input_tensor_dims, tensor_slice_offsets, tensor_slice_extents,
+ target_input_block_sizes, DimensionList<Index, NumInputDims>());
+
+ const Index num_outputs_to_update =
+ tensor_slice_extents[first_preserved_dim_input_index];
+ const Index preserved_dim_vector_reducer_count =
+ (inner_most_dim_preserved && packet_reductions_enabled)
+ ? num_outputs_to_update / PacketSize
+ : 0;
+ const Index preserved_dim_vector_coeff_count =
+ inner_most_dim_preserved
+ ? preserved_dim_vector_reducer_count * PacketSize
+ : 0;
+ const Index preserved_dim_reducer_limit =
+ (inner_most_dim_preserved && packet_reductions_enabled)
+ ? (preserved_dim_vector_reducer_count +
+ num_outputs_to_update % PacketSize)
+ : num_outputs_to_update;
+
+ const Index total_block_count = block_mapper.total_block_count();
+ for (Index b = 0; b < total_block_count; ++b) {
+ InputTensorBlock input_block =
+ block_mapper.GetBlockForIndex(b, input_block_data);
+ // Read.
+ m_impl.block(&input_block);
+
+ Index num_values_to_reduce = 1;
+ for (Index i = 0; i < NumInputDims; ++i) {
+ if (m_reduced[i]) {
+ num_values_to_reduce *= input_block.block_sizes()[i];
+ }
+ }
+ // Reduce.
+ if (inner_most_dim_preserved) {
+ const Index input_outer_dim_size =
+ input_block.block_sizes().TotalSize() / num_outputs_to_update;
+ for (Index input_outer_dim_index = 0;
+ input_outer_dim_index < input_outer_dim_size;
+ ++input_outer_dim_index) {
+ const Index input_outer_dim_base =
+ input_outer_dim_index * num_outputs_to_update;
+ for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
+ reducers[i].Reduce(input_outer_dim_base + i * PacketSize,
+ PacketSize, input_block.data());
+ }
+ const Index scalar_reducer_base =
+ input_outer_dim_base + preserved_dim_vector_coeff_count;
+ for (Index i = preserved_dim_vector_reducer_count;
+ i < preserved_dim_reducer_limit; ++i) {
+ reducers[i].Reduce(scalar_reducer_base + i -
+ preserved_dim_vector_reducer_count,
+ 1, input_block.data());
+ }
+ }
+ } else {
+ for (Index i = 0; i < num_outputs_to_update; ++i) {
+ reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce,
+ input_block.data());
+ }
+ }
+ }
+
+ // Finalize all reducers for this output shard.
+ const Index output_base_index =
+ output_outer_index * output_block_inner_dim_size +
+ output_shard_index * output_shard_size;
+ if (inner_most_dim_preserved) {
+ EIGEN_ALIGN_MAX
+ typename internal::remove_const<CoeffReturnType>::type
+ values[PacketSize];
+ for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
+ const Index reducer_base = output_base_index + i * PacketSize;
+ internal::pstore<CoeffReturnType, PacketReturnType>(
+ values, reducers[i].FinalizePacket());
+ for (Index j = 0; j < PacketSize; ++j) {
+ output_block->data()[reducer_base + j] = values[j];
+ }
+ }
+ const Index scalar_reducer_base =
+ output_base_index + preserved_dim_vector_coeff_count;
+
+ for (Index i = preserved_dim_vector_reducer_count;
+ i < preserved_dim_reducer_limit; ++i) {
+ output_block->data()[scalar_reducer_base + i -
+ preserved_dim_vector_reducer_count] =
+ reducers[i].Finalize();
+ }
+ } else {
+ for (int i = 0; i < num_outputs_to_update; ++i) {
+ output_block->data()[output_base_index + i] =
+ reducers[i].Finalize();
+ }
+ }
+
+ // Update 'tensor_slice_offsets' by num outputs for this output shard.
+ tensor_slice_offsets[first_preserved_dim_input_index] +=
+ num_outputs_to_update;
+ }
+ // Update slice offset for inner preserved dim.
+ tensor_slice_offsets[first_preserved_dim_input_index] -=
+ output_block_inner_dim_size;
+ // Update slice offsets for remaining output dims.
+ for (int i = 0; i < NumOutputDims - 1; ++i) {
+ BlockIteratorState& b = block_iter_state[i];
+ if (++b.output_count < b.output_size) {
+ ++tensor_slice_offsets[b.input_dim];
+ break;
+ }
+ b.output_count = 0;
+ tensor_slice_offsets[b.input_dim] -= b.output_size - 1;
+ }
+ }
+
+ // Free memory.
+ m_device.deallocate(input_block_data);
+ m_device.deallocate(reducers);
+ }
+
EIGEN_DEVICE_FUNC typename MakePointer_<CoeffReturnType>::Type data() const { return m_result; }
#if defined(EIGEN_USE_SYCL)
@@ -722,6 +1064,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
template <typename S, typename O, typename D> friend struct internal::InnerReducer;
+ struct BlockIteratorState {
+ Index input_dim;
+ Index output_size;
+ Index output_count;
+ };
+
// Returns the Index in the input tensor of the first value that needs to be
// used to compute the reduction at output index "index".
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -764,16 +1112,90 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
return startInput;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex(
+ Index index,
+ DSizes<Index, NumInputDims>* coords) const {
+ for (int i = 0; i < NumInputDims; ++i) {
+ (*coords)[i] = 0;
+ }
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumOutputDims - 1; i > 0; --i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ (*coords)[m_output_to_input_dim_map[i]] = idx;
+ index -= idx * m_outputStrides[i];
+ }
+ (*coords)[m_output_to_input_dim_map[0]] = index;
+ } else {
+ for (int i = 0; i < NumOutputDims - 1; ++i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ (*coords)[m_output_to_input_dim_map[i]] = idx;
+ index -= idx * m_outputStrides[i];
+ }
+ (*coords)[m_output_to_input_dim_map[NumOutputDims-1]] = index;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape(
+ const Index max_coeff_count,
+ const DSizes<Index, NumInputDims>& input_slice_sizes,
+ DSizes<Index, NumInputDims>* target_input_block_sizes) const {
+ typedef typename internal::packet_traits<Scalar>::type Packet;
+ typedef internal::BlockReducer<Self, Op> BlockReducer;
+ // TODO(andydavis) Compute reducer overhead correctly for the case where
+ // we are preserving the inner most dimension, and a single reducer
+ // reduces a packet's worth of output coefficients.
+ const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar);
+
+ Index coeff_to_allocate = max_coeff_count;
+ bool first_preserved_dim_allocated = false;
+ bool first_reduced_dim_allocated = false;
+ for (int i = 0; i < NumInputDims; ++i) {
+ const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? i
+ : NumInputDims - i - 1;
+ (*target_input_block_sizes)[dim] = 1;
+ if (m_reduced[dim]) {
+ // TODO(andydavis) Consider allocating to multiple reduced dimensions.
+ // Watch out for cases where reduced dimensions are not contiguous,
+ // which induces scattered reads.
+ if (!first_reduced_dim_allocated) {
+ (*target_input_block_sizes)[dim] =
+ numext::mini(input_slice_sizes[dim], coeff_to_allocate);
+ coeff_to_allocate /= (*target_input_block_sizes)[dim];
+ first_reduced_dim_allocated = true;
+ }
+ } else if (!first_preserved_dim_allocated) {
+ // TODO(andydavis) Include output block size in this L1 working set
+ // calculation.
+ const Index allocated = max_coeff_count - coeff_to_allocate;
+ const Index alloc_size = numext::maxi(
+ static_cast<Index>(1), coeff_to_allocate / reducer_overhead);
+ (*target_input_block_sizes)[dim] =
+ numext::mini(input_slice_sizes[dim], alloc_size);
+ coeff_to_allocate = numext::maxi(
+ static_cast<Index>(1),
+ coeff_to_allocate /
+ ((*target_input_block_sizes)[dim] * reducer_overhead));
+ first_preserved_dim_allocated = true;
+ }
+ }
+ }
+
// Bitmap indicating if an input dimension is reduced or not.
array<bool, NumInputDims> m_reduced;
// Dimensions of the output of the operation.
Dimensions m_dimensions;
// Precomputed strides for the output tensor.
array<Index, NumOutputDims> m_outputStrides;
+ array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
// Subset of strides of the input tensor for the non-reduced dimensions.
// Indexed by output dimensions.
static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
array<Index, NumPreservedStrides> m_preservedStrides;
+ // Map from output to input dimension index.
+ array<Index, NumOutputDims> m_output_to_input_dim_map;
+ // How many values go into each reduction
+ Index m_numValuesToReduce;
// Subset of strides of the input tensor for the reduced dimensions.
// Indexed by reduced dimensions.
@@ -782,6 +1204,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
// Indexed by reduced dimensions.
array<Index, NumReducedDims> m_reducedDims;
+ // Block size for tiled (aka TensorBlock) evaluation.
+ Index m_block_total_size_max;
+
// Evaluator for the input expression.
TensorEvaluator<ArgType, Device> m_impl;
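For reference, a minimal usage sketch of the kind of reduction this evaluator services, via the public Tensor API; the tensor rank, sizes, and the choice of a sum reducer over dimension 1 are arbitrary examples.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // A 3-D tensor reduced over its middle dimension; dimensions 0 and 2 are
  // preserved in the output, dimension 1 is reduced.
  Eigen::Tensor<float, 3> input(64, 128, 32);
  input.setRandom();

  Eigen::array<Eigen::Index, 1> reduce_dims = {1};
  Eigen::Tensor<float, 2> output = input.sum(reduce_dims);

  return output.size() == 64 * 32 ? 0 : 1;
}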