From 60ae24ee1a6c16114de456d77fcfba6f5a1160ca Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Wed, 2 Oct 2019 12:44:06 -0700
Subject: Add block evaluation to TensorReshaping/TensorCasting/TensorPadding/TensorSelect

---
 unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 254 ++++++++++++++++++++-
 1 file changed, 244 insertions(+), 10 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 7b9ad7374..be2449ebd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -96,22 +96,29 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    BlockAccessV2 = false,
-    PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = true,
-    RawAccess = false
+    IsAligned         = true,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    BlockAccessV2     = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = true,
+    RawAccess         = false
   };
 
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlockV2;
   //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
+      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
   {
     // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
     // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
@@ -212,6 +219,214 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     return cost;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::kSkewedInnerDims, block_total_size_max));
+
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+  blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+    eigen_assert(m_impl.data() != NULL);
+
+    // Check if we can reuse `desc` destination, or allocate new scratch buffer.
+    ScalarNoConst* materialized_output =
+        desc.template destination<ScalarNoConst, Layout>();
+
+    bool materialized_in_output;
+    if (materialized_output != NULL) {
+      desc.DropDestinationBuffer();
+      materialized_in_output = true;
+
+    } else {
+      const size_t materialized_output_size = desc.size() * sizeof(Scalar);
+      void* output_scratch_mem = scratch.allocate(materialized_output_size);
+      materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem);
+      materialized_in_output = false;
+    }
+
+    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
+
+    Index offset = desc.offset();
+
+    // Compute offsets in the output tensor corresponding to the desc.offset().
+    DSizes<Index, NumDims> output_offsets;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      const int stride_dim = IsColMajor ? dim : dim + 1;
+      output_offsets[dim] = offset / m_outputStrides[stride_dim];
+      offset -= output_offsets[dim] * m_outputStrides[stride_dim];
+    }
+    output_offsets[IsColMajor ? 0 : NumDims - 1] = offset;
+
+    // Offsets in the input corresponding to output offsets.
+    DSizes<Index, NumDims> input_offsets = output_offsets;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
+    }
+
+    // Compute offset in the input buffer (at this point it might be illegal and
+    // point outside of the input buffer, because we don't check for negative
+    // offsets; it will be autocorrected in the block iteration loop below).
+    Index input_offset = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offset += input_offsets[dim] * m_inputStrides[dim];
+    }
+
+    // Destination buffer and scratch buffer both indexed from 0 and have the
+    // same dimensions as the requested block (for destination buffer this
+    // property is guaranteed by `desc.destination()`).
+    Index output_offset = 0;
+    const DSizes<Index, NumDims> output_strides =
+        internal::strides<Layout>(desc.dimensions());
+
+    // NOTE(ezhulenev): We initialize block iteration state for `NumDims - 1`
+    // dimensions, skipping innermost dimension. In theory it should be possible
+    // to squeeze matching innermost dimensions, however in practice that did
+    // not show any improvements in benchmarks. Also in practice first outer
+    // dimension usually has padding, and will prevent squeezing.
+
+    // Initialize output block iterator state. Dimensions in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims - 1> it;
+    for (Index i = 0; i < NumDims - 1; ++i) {
+      const Index dim = IsColMajor ? i + 1 : NumDims - i - 2;
+      it[i].count = 0;
+      it[i].size = desc.dimension(dim);
+
+      it[i].input_stride = m_inputStrides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      it[i].output_stride = output_strides[dim];
+      it[i].output_span = it[i].output_stride * (it[i].size - 1);
+    }
+
+    const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
+
+    // Total output size.
+    const Index output_size = desc.size();
+
+    // We will fill inner dimension of this size in the output. It might be
+    // larger than the inner dimension in the input, so we might have to pad
+    // before/after we copy values from the input inner dimension.
+    const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
+
+    // How many values to fill with padding BEFORE reading from the input inner
+    // dimension.
+    const Index output_inner_pad_before_size =
+        input_offsets[inner_dim_idx] < 0
+            ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
+                           output_inner_dim_size)
+            : 0;
+
+    // How many values we can actually copy from the input inner dimension.
+    const Index output_inner_copy_size = numext::mini(
+        // Want to copy from input.
+        (output_inner_dim_size - output_inner_pad_before_size),
+        // Can copy from input.
+        (static_cast<Index>(m_impl.dimensions()[inner_dim_idx]) -
+         numext::maxi(input_offsets[inner_dim_idx], Index(0))));
+
+    // How many values to fill with padding AFTER reading from the input inner
+    // dimension.
+    const Index output_inner_pad_after_size =
+        (output_inner_dim_size - output_inner_copy_size -
+         output_inner_pad_before_size);
+
+    // Sanity check: sum of all sizes must be equal to the output size.
+    eigen_assert(output_inner_dim_size ==
+                 (output_inner_pad_before_size + output_inner_copy_size +
+                  output_inner_pad_after_size));
+
+    // Keep track of current coordinates and padding in the output.
+    DSizes<Index, NumDims> output_coord = output_offsets;
+    DSizes<Index, NumDims> output_padded;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+    }
+
+    typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
+
+    // Iterate copying data from `m_impl.data()` to the output buffer.
+    for (Index size = 0; size < output_size; size += output_inner_dim_size) {
+      // Detect if we are in the padded region (exclude innermost dimension).
+      bool is_padded = false;
+      for (int j = 1; j < NumDims; ++j) {
+        const int dim = IsColMajor ? j : NumDims - j - 1;
+        is_padded = output_padded[dim];
+        if (is_padded) break;
+      }
+
+      if (is_padded) {
+        // Fill with padding value.
+        LinCopy::template Run<LinCopy::Kind::FillLinear>(
+            typename LinCopy::Dst(output_offset, 1, materialized_output),
+            typename LinCopy::Src(0, 0, &m_paddingValue),
+            output_inner_dim_size);
+
+      } else {
+        {  // Fill with padding before copying from input inner dimension.
+          const Index out = output_offset;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(
+              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Src(0, 0, &m_paddingValue),
+              output_inner_pad_before_size);
+        }
+
+        {  // Copy data from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size;
+          const Index in = input_offset + output_inner_pad_before_size;
+
+          LinCopy::template Run<LinCopy::Kind::Linear>(
+              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Src(in, 1, m_impl.data()),
+              output_inner_copy_size);
+        }
+
+        {  // Fill with padding after copying from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size +
+                            output_inner_copy_size;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(
+              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Src(0, 0, &m_paddingValue),
+              output_inner_pad_after_size);
+        }
+      }
+
+      for (int j = 0; j < NumDims - 1; ++j) {
+        const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
+
+        if (++it[j].count < it[j].size) {
+          input_offset += it[j].input_stride;
+          output_offset += it[j].output_stride;
+          output_coord[dim] += 1;
+          output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+          break;
+        }
+        it[j].count = 0;
+        input_offset -= it[j].input_span;
+        output_offset -= it[j].output_span;
+        output_coord[dim] -= it[j].size - 1;
+        output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+      }
+    }
+
+    return TensorBlockV2(materialized_in_output
+                             ? internal::TensorBlockKind::kMaterializedInOutput
+                             : internal::TensorBlockKind::kMaterializedInScratch,
+                         materialized_output,
+                         desc.dimensions());
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
 
 #ifdef EIGEN_USE_SYCL
@@ -222,6 +437,23 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 #endif
 
  private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : count(0),
+          size(0),
+          input_stride(0),
+          input_span(0),
+          output_stride(0),
+          output_span(0) {}
+
+    Index count;
+    Index size;
+    Index input_stride;
+    Index input_span;
+    Index output_stride;
+    Index output_span;
+  };
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
       Index index, int dim_index) const {
 #if defined(EIGEN_HAS_INDEX_LIST)
@@ -410,6 +642,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
   PaddingDimensions m_padding;
   Scalar m_paddingValue;
+
+  const Device EIGEN_DEVICE_REF m_device;
 };
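Note (not part of the patch): a minimal sketch of how the padding evaluator above is typically driven from user code. Assigning a padded expression through a ThreadPoolDevice is one way the block-based path added here (getResourceRequirements / blockV2) may be exercised internally, depending on the Eigen version and executor; the shapes, padding amounts, and the four-thread pool below are illustrative assumptions, not values taken from the commit.

// Sketch: evaluate a padded tensor expression. With EIGEN_USE_THREADS the
// assignment goes through the ThreadPoolDevice executor, which can dispatch
// to the evaluator's block interface when it is available.
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <utility>

int main() {
  Eigen::Tensor<float, 2> input(30, 50);
  input.setRandom();

  // (before, after) padding amounts for each dimension.
  Eigen::array<std::pair<int, int>, 2> paddings;
  paddings[0] = std::make_pair(1, 2);  // rows:    1 before, 2 after
  paddings[1] = std::make_pair(3, 4);  // columns: 3 before, 4 after

  // Output shape grows by the padding amounts: (30+1+2) x (50+3+4).
  Eigen::Tensor<float, 2> padded(33, 57);

  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);
  padded.device(device) = input.pad(paddings);  // pads with 0 by default

  return 0;
}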