From 83c0a16baf5ecac6288cd9b74536a82de8985b31 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 31 Jul 2018 15:56:31 -0700
Subject: Add block evaluation support to TensorOps

---
 .../Eigen/CXX11/src/Tensor/TensorReduction.h | 437 ++++++++++++++++++++-
 1 file changed, 431 insertions(+), 6 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 375fc0802..05c0990dc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -356,6 +356,70 @@ template <int NPT, typename S, typename R, typename I>
 __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
 
+template <typename Self, typename Op,
+          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+class BlockReducer {
+ public:
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::PacketReturnType PacketReturnType;
+  explicit BlockReducer(const Op& reducer) : op_(reducer) {
+    accum_ = op_.initialize();
+  }
+  void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
+    for (Index i = 0; i < num_values_to_reduce; ++i) {
+      op_.reduce(data[index + i], &accum_);
+    }
+  }
+  CoeffReturnType Finalize() { return op_.finalize(accum_); }
+  PacketReturnType FinalizePacket() {
+    // TODO(andydavis) This function should not be called for Scalar
+    // reductions: clean this up or add an assert here.
+    return PacketReturnType();
+  }
+
+ private:
+  CoeffReturnType accum_;
+  Op op_;
+};
+
+template <typename Self, typename Op>
+class BlockReducer<Self, Op, true> {
+ public:
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::PacketReturnType PacketReturnType;
+  static const Index PacketSize =
+      internal::unpacket_traits<PacketReturnType>::size;
+
+  explicit BlockReducer(const Op& reducer) : op_(reducer) {
+    vaccum_ = op_.template initializePacket<PacketReturnType>();
+    accum_ = op_.initialize();
+  }
+  void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
+    const Index vectorized_size =
+        (num_values_to_reduce / PacketSize) * PacketSize;
+    for (Index i = 0; i < vectorized_size; i += PacketSize) {
+      op_.reducePacket(
+          internal::ploadt<PacketReturnType, Unaligned>(&data[index + i]),
+          &vaccum_);
+    }
+    for (Index i = vectorized_size; i < num_values_to_reduce; ++i) {
+      op_.reduce(data[index + i], &accum_);
+    }
+  }
+  CoeffReturnType Finalize() { return op_.finalizeBoth(accum_, vaccum_); }
+  PacketReturnType FinalizePacket() { return op_.finalizePacket(vaccum_); }
+
+ private:
+  PacketReturnType vaccum_;
+  CoeffReturnType accum_;
+  Op op_;
+};
+
 } // end namespace internal
 
@@ -394,6 +458,7 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors>
 template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
 struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
 {
+  typedef internal::reducer_traits<Op, Device> ReducerTraits;
   typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
   typedef typename XprType::Index Index;
   typedef ArgType ChildType;
@@ -410,14 +475,19 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned = false,
+    IsAligned    = false,
     PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    BlockAccess  = TensorEvaluator<ArgType, Device>::BlockAccess,
+    Layout       = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess  = false,  // to be implemented
+    RawAccess    = false
   };
 
+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  using OutputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>;
+  using InputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>;
+
   static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool RunningFullReduction = (NumOutputDims==0);
@@ -451,11 +521,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
       m_outputStrides[0] = 1;
       for (int i = 1; i < NumOutputDims; ++i) {
         m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     } else {
-      m_outputStrides.back() = 1;
+      m_outputStrides[NumOutputDims - 1] = 1;
       for (int i = NumOutputDims - 2; i >= 0; --i) {
         m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     }
   }
@@ -483,6 +555,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
         ++reduceIndex;
       } else {
         m_preservedStrides[outputIndex] = input_strides[i];
+        m_output_to_input_dim_map[outputIndex] = i;
         ++outputIndex;
       }
     }
@@ -492,6 +565,16 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
     if (NumOutputDims == 0) {
       m_preservedStrides[0] = internal::array_prod(input_dims);
     }
+
+    m_numValuesToReduce =
+        NumOutputDims == 0
+            ? internal::array_prod(input_dims)
+            : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
+                  ? m_preservedStrides[0]
+                  : m_preservedStrides[NumOutputDims - 1];
+
+    m_block_total_size_max =
+        numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -686,6 +769,265 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
     }
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::TensorBlockShapeType::kSkewedInnerDims,
+        m_block_total_size_max));
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block(
+      OutputTensorBlock* output_block) const {
+    // Special case full reductions to avoid input block copy below.
+    if (NumInputDims == NumReducedDims) {
+      eigen_assert(output_block->first_coeff_index() == 0);
+      eigen_assert(output_block->block_sizes().TotalSize() == 1);
+      Op reducer(m_reducer);
+      output_block->data()[0] = internal::InnerMostDimReducer<Self, Op>::reduce(
+          *this, 0, m_numValuesToReduce, reducer);
+      return;
+    }
+
+    // Calculate input tensor 'slice' required to reduce output block coeffs.
+    DSizes<Index, NumInputDims> input_slice_sizes(m_impl.dimensions());
+    for (int i = 0; i < NumOutputDims; ++i) {
+      // Clip preserved input dimensions by output block size.
+      input_slice_sizes[m_output_to_input_dim_map[i]] =
+          output_block->block_sizes()[i];
+    }
+
+    // Shard input tensor slice into blocks (because it could be large if we
+    // need to reduce along several dimensions to calculate required output
+    // coefficients).
+    const Index max_coeff_count =
+        numext::mini<Index>(((m_device.firstLevelCacheSize()) / sizeof(Scalar)),
+                            input_slice_sizes.TotalSize());
+
+    // Calculate max output shard size needed to keep working set of reducers
+    // in L1, while leaving enough space for reducer overhead and 'PacketSize'
+    // reductions.
+    DSizes<Index, NumInputDims> target_input_block_sizes;
+    CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes,
+                                   &target_input_block_sizes);
+    // Calculate indices for first preserved dimension.
+    const Index first_preserved_dim_output_index =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor)
+            ? 0
+            : NumOutputDims - 1;
+    const Index first_preserved_dim_input_index =
+        m_output_to_input_dim_map[first_preserved_dim_output_index];
+    const bool inner_most_dim_preserved =
+        first_preserved_dim_input_index ==
+            (static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                 ? 0
+                 : NumInputDims - 1) |
+        PreservingInnerMostDims;
+
+    // Calculate output block inner/outer dimension sizes.
+    const Index output_block_inner_dim_size =
+        output_block->block_sizes()[first_preserved_dim_output_index];
+    const Index output_block_outer_dim_size =
+        output_block->block_sizes().TotalSize() / output_block_inner_dim_size;
+    // Calculate shard size for first preserved dimension.
+    const Index output_shard_size =
+        target_input_block_sizes[first_preserved_dim_input_index];
+    const Index num_output_shards =
+        (output_block_inner_dim_size + output_shard_size - 1) /
+        output_shard_size;
+
+    // Initialize 'tensor_slice_offsets' from input coords of output index.
+    DSizes<Index, NumInputDims> tensor_slice_offsets;
+    GetInputCoordsForOutputIndex(output_block->first_coeff_index(),
+                                 &tensor_slice_offsets);
+
+    // Store tensor slice offset in first preserved dimension to be used
+    // to update tensor slice extents in loop below.
+    const Index first_preserved_dim_offset_start =
+        tensor_slice_offsets[first_preserved_dim_input_index];
+
+    array<BlockIteratorState, NumOutputDims> block_iter_state;
+
+    // Initialize state used to iterate through output coefficients
+    // and update 'tensor_slice_offsets' in outer preserved dims.
+    for (int i = 0; i < NumOutputDims - 1; ++i) {
+      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                          ? i + 1
+                          : NumOutputDims - i - 2;
+      block_iter_state[i].input_dim = m_output_to_input_dim_map[dim];
+      block_iter_state[i].output_size = output_block->block_sizes()[dim];
+      block_iter_state[i].output_count = 0;
+    }
+
+    // Allocate input block memory.
+    ScalarNoConst* input_block_data = static_cast<ScalarNoConst*>(
+        m_device.allocate(max_coeff_count * sizeof(Scalar)));
+    // Allocate reducer memory.
+    const bool packet_reductions_enabled =
+        (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess);
+    const Index num_reducers =
+        (inner_most_dim_preserved && packet_reductions_enabled)
+            ? (output_shard_size / PacketSize + output_shard_size % PacketSize +
+               PacketSize)
+            : output_shard_size;
+    typedef internal::BlockReducer<Self, Op> BlockReducer;
+    BlockReducer* reducers = static_cast<BlockReducer*>(
+        m_device.allocate(num_reducers * sizeof(BlockReducer)));
+
+    InputDimensions input_tensor_dims(m_impl.dimensions());
+    for (Index output_outer_index = 0;
+         output_outer_index < output_block_outer_dim_size;
+         ++output_outer_index) {
+      for (Index output_shard_index = 0; output_shard_index < num_output_shards;
+           ++output_shard_index) {
+        // Initialize 'tensor_slice_extents' for this output shard.
+        DSizes<Index, NumInputDims> tensor_slice_extents(input_slice_sizes);
+        for (int i = 0; i < NumInputDims; ++i) {
+          if (i == first_preserved_dim_input_index) {
+            // Clip first preserved dim size to output shard size.
+            tensor_slice_extents[i] = numext::mini(
+                output_shard_size,
+                input_slice_sizes[i] - (tensor_slice_offsets[i] -
+                                        first_preserved_dim_offset_start));
+
+          } else if (!m_reduced[i]) {
+            // Clip outer preserved dims to size 1, so that we reduce a
+            // contiguous set of output coefficients.
+            tensor_slice_extents[i] = 1;
+          }
+        }
+
+        // Initialize output coefficient reducers.
+        for (int i = 0; i < num_reducers; ++i) {
+          new (&reducers[i]) BlockReducer(m_reducer);
+        }
+
+        using TensorSliceBlockMapper =
+            internal::TensorSliceBlockMapper<ScalarNoConst, Index,
+                                             NumInputDims, Layout>;
+
+        // TODO(andydavis) Consider removing 'input_block_stride_order' if we
+        // find that scattered reads are not worth supporting in
+        // TensorSliceBlockMapper.
+        TensorSliceBlockMapper block_mapper(
+            input_tensor_dims, tensor_slice_offsets, tensor_slice_extents,
+            target_input_block_sizes, DimensionList<Index, NumInputDims>());
+
+        const Index num_outputs_to_update =
+            tensor_slice_extents[first_preserved_dim_input_index];
+        const Index preserved_dim_vector_reducer_count =
+            (inner_most_dim_preserved && packet_reductions_enabled)
+                ? num_outputs_to_update / PacketSize
+                : 0;
+        const Index preserved_dim_vector_coeff_count =
+            inner_most_dim_preserved
+                ? preserved_dim_vector_reducer_count * PacketSize
+                : 0;
+        const Index preserved_dim_reducer_limit =
+            (inner_most_dim_preserved && packet_reductions_enabled)
+                ? (preserved_dim_vector_reducer_count +
+                   num_outputs_to_update % PacketSize)
+                : num_outputs_to_update;
+
+        const Index total_block_count = block_mapper.total_block_count();
+        for (Index b = 0; b < total_block_count; ++b) {
+          InputTensorBlock input_block =
+              block_mapper.GetBlockForIndex(b, input_block_data);
+          // Read.
+          m_impl.block(&input_block);
+
+          Index num_values_to_reduce = 1;
+          for (Index i = 0; i < NumInputDims; ++i) {
+            if (m_reduced[i]) {
+              num_values_to_reduce *= input_block.block_sizes()[i];
+            }
+          }
+          // Reduce.
+          if (inner_most_dim_preserved) {
+            const Index input_outer_dim_size =
+                input_block.block_sizes().TotalSize() / num_outputs_to_update;
+            for (Index input_outer_dim_index = 0;
+                 input_outer_dim_index < input_outer_dim_size;
+                 ++input_outer_dim_index) {
+              const Index input_outer_dim_base =
+                  input_outer_dim_index * num_outputs_to_update;
+              for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
+                reducers[i].Reduce(input_outer_dim_base + i * PacketSize,
+                                   PacketSize, input_block.data());
+              }
+              const Index scalar_reducer_base =
+                  input_outer_dim_base + preserved_dim_vector_coeff_count;
+              for (Index i = preserved_dim_vector_reducer_count;
+                   i < preserved_dim_reducer_limit; ++i) {
+                reducers[i].Reduce(scalar_reducer_base + i -
+                                       preserved_dim_vector_reducer_count,
+                                   1, input_block.data());
+              }
+            }
+          } else {
+            for (Index i = 0; i < num_outputs_to_update; ++i) {
+              reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce,
+                                 input_block.data());
+            }
+          }
+        }
+
+        // Finalize all reducers for this output shard.
+        const Index output_base_index =
+            output_outer_index * output_block_inner_dim_size +
+            output_shard_index * output_shard_size;
+        if (inner_most_dim_preserved) {
+          EIGEN_ALIGN_MAX
+          typename internal::remove_const<CoeffReturnType>::type
+              values[PacketSize];
+          for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) {
+            const Index reducer_base = output_base_index + i * PacketSize;
+            internal::pstore<CoeffReturnType, PacketReturnType>(
+                values, reducers[i].FinalizePacket());
+            for (Index j = 0; j < PacketSize; ++j) {
+              output_block->data()[reducer_base + j] = values[j];
+            }
+          }
+          const Index scalar_reducer_base =
+              output_base_index + preserved_dim_vector_coeff_count;
+
+          for (Index i = preserved_dim_vector_reducer_count;
+               i < preserved_dim_reducer_limit; ++i) {
+            output_block->data()[scalar_reducer_base + i -
+                                 preserved_dim_vector_reducer_count] =
+                reducers[i].Finalize();
+          }
+        } else {
+          for (int i = 0; i < num_outputs_to_update; ++i) {
+            output_block->data()[output_base_index + i] =
+                reducers[i].Finalize();
+          }
+        }
+
+        // Update 'tensor_slice_offsets' by num outputs for this output shard.
+        tensor_slice_offsets[first_preserved_dim_input_index] +=
+            num_outputs_to_update;
+      }
+      // Update slice offset for inner preserved dim.
+      tensor_slice_offsets[first_preserved_dim_input_index] -=
+          output_block_inner_dim_size;
+      // Update slice offsets for remaining output dims.
+      for (int i = 0; i < NumOutputDims - 1; ++i) {
+        BlockIteratorState& b = block_iter_state[i];
+        if (++b.output_count < b.output_size) {
+          ++tensor_slice_offsets[b.input_dim];
+          break;
+        }
+        b.output_count = 0;
+        tensor_slice_offsets[b.input_dim] -= b.output_size - 1;
+      }
+    }
+
+    // Free memory.
+    m_device.deallocate(input_block_data);
+    m_device.deallocate(reducers);
+  }
+
   EIGEN_DEVICE_FUNC typename MakePointer_<CoeffReturnType>::Type data() const { return m_result; }
 
 #if defined(EIGEN_USE_SYCL)
@@ -722,6 +1064,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   template <typename S, typename O, typename D> friend struct internal::InnerReducer;
 
+  struct BlockIteratorState {
+    Index input_dim;
+    Index output_size;
+    Index output_count;
+  };
+
   // Returns the Index in the input tensor of the first value that needs to be
   // used to compute the reduction at output index "index".
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -764,16 +1112,90 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
     return startInput;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex(
+      Index index,
+      DSizes<Index, NumInputDims>* coords) const {
+    for (int i = 0; i < NumInputDims; ++i) {
+      (*coords)[i] = 0;
+    }
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumOutputDims - 1; i > 0; --i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        (*coords)[m_output_to_input_dim_map[i]] = idx;
+        index -= idx * m_outputStrides[i];
+      }
+      (*coords)[m_output_to_input_dim_map[0]] = index;
+    } else {
+      for (int i = 0; i < NumOutputDims - 1; ++i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        (*coords)[m_output_to_input_dim_map[i]] = idx;
+        index -= idx * m_outputStrides[i];
+      }
+      (*coords)[m_output_to_input_dim_map[NumOutputDims - 1]] = index;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape(
+      const Index max_coeff_count,
+      const DSizes<Index, NumInputDims>& input_slice_sizes,
+      DSizes<Index, NumInputDims>* target_input_block_sizes) const {
+    typedef typename internal::packet_traits<Scalar>::type Packet;
+    typedef internal::BlockReducer<Self, Op> BlockReducer;
+    // TODO(andydavis) Compute reducer overhead correctly for the case where
+    // we are preserving the inner most dimension, and a single reducer
+    // reduces a packet's worth of output coefficients.
+    const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar);
+
+    Index coeff_to_allocate = max_coeff_count;
+    bool first_preserved_dim_allocated = false;
+    bool first_reduced_dim_allocated = false;
+    for (int i = 0; i < NumInputDims; ++i) {
+      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                          ? i
+                          : NumInputDims - i - 1;
+      (*target_input_block_sizes)[dim] = 1;
+      if (m_reduced[dim]) {
+        // TODO(andydavis) Consider allocating to multiple reduced dimensions.
+        // Watch out for cases where reduced dimensions are not contiguous,
+        // which induces scattered reads.
+        if (!first_reduced_dim_allocated) {
+          (*target_input_block_sizes)[dim] =
+              numext::mini(input_slice_sizes[dim], coeff_to_allocate);
+          coeff_to_allocate /= (*target_input_block_sizes)[dim];
+          first_reduced_dim_allocated = true;
+        }
+      } else if (!first_preserved_dim_allocated) {
+        // TODO(andydavis) Include output block size in this L1 working set
+        // calculation.
+        const Index allocated = max_coeff_count - coeff_to_allocate;
+        const Index alloc_size = numext::maxi(
+            static_cast<Index>(1), coeff_to_allocate / reducer_overhead);
+        (*target_input_block_sizes)[dim] =
+            numext::mini(input_slice_sizes[dim], alloc_size);
+        coeff_to_allocate = numext::maxi(
+            static_cast<Index>(1),
+            coeff_to_allocate /
+                ((*target_input_block_sizes)[dim] * reducer_overhead));
+        first_preserved_dim_allocated = true;
+      }
+    }
+  }
+
   // Bitmap indicating if an input dimension is reduced or not.
   array<bool, NumInputDims> m_reduced;
   // Dimensions of the output of the operation.
   Dimensions m_dimensions;
   // Precomputed strides for the output tensor.
   array<Index, NumOutputDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
   // Subset of strides of the input tensor for the non-reduced dimensions.
   // Indexed by output dimensions.
   static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
   array<Index, NumPreservedStrides> m_preservedStrides;
+  // Map from output to input dimension index.
+  array<Index, NumOutputDims> m_output_to_input_dim_map;
+  // How many values go into each reduction
+  Index m_numValuesToReduce;
 
   // Subset of strides of the input tensor for the reduced dimensions.
   // Indexed by reduced dimensions.
@@ -782,6 +1204,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   // Indexed by reduced dimensions.
   array<Index, NumReducedDims> m_reducedDims;
 
+  // Block size for tiled (aka TensorBlock) evaluation.
+  Index m_block_total_size_max;
+
   // Evaluator for the input expression.
   TensorEvaluator<ArgType, Device> m_impl;
--
cgit v1.2.3
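
Usage note (a sketch, not part of the patch): with BlockAccess now enabled on
the reduction evaluator, a tiling-aware executor can drive the new
getResourceRequirements()/block() pair instead of evaluating one output
coefficient at a time. The snippet below shows the kind of expression that can
take this path on a ThreadPoolDevice; it assumes the TensorExecutor tiling
support from the same patch series is in place, and the pool size and tensor
shapes are arbitrary illustrative choices.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Thread pool backing the device; 4 threads is an arbitrary choice.
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  // Rank-3 input; reduce dimensions 0 and 2, preserving dimension 1.
  Eigen::Tensor<float, 3> input(64, 128, 32);
  input.setRandom();

  Eigen::array<Eigen::Index, 2> reduce_dims{{0, 2}};
  Eigen::Tensor<float, 1> output(128);

  // A block-capable executor may evaluate this reduction tile-by-tile:
  // each output block is mapped back to an input slice, the slice is
  // sharded to fit in L1, and the shards are accumulated by the
  // BlockReducer instances introduced in this patch.
  output.device(device) = input.sum(reduce_dims);
  return 0;
}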