From 33e174613987cfc6c83576dc0fe8086c7a5d1b1f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 9 Oct 2019 12:45:31 -0700 Subject: Block evaluation for TensorChipping + fixed bugs in TensorPadding and TensorSlicing --- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 125 +++++++++++++++++++-- 1 file changed, 118 insertions(+), 7 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 8860840a7..20591da33 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -149,7 +149,7 @@ struct TensorEvaluator, Device> Layout = TensorEvaluator::Layout, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::BlockAccessV2, // Chipping of outer-most dimension is a trivial operation, because we can // read and write directly from the underlying tensor using single offset. IsOuterChipping = (static_cast(Layout) == ColMajor && DimId == NumInputDims - 1) || @@ -171,7 +171,17 @@ struct TensorEvaluator, Device> OutputTensorBlock; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef internal::TensorBlockDescriptor + ArgTensorBlockDesc; + typedef typename TensorEvaluator::TensorBlockV2 + ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -357,6 +367,72 @@ struct TensorEvaluator, Device> m_impl.block(&input_block); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + const Index chip_dim = m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) + : i > chip_dim ? desc.dimension(i - 1) + : 1; + } + + ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims); + + // Try to reuse destination buffer for materializing argument block. + ScalarNoConst* destination_buffer = + desc.template destination(); + if (destination_buffer != NULL) { + arg_desc.AddDestinationBuffer( + destination_buffer, internal::strides(arg_desc.dimensions()), + (arg_desc.size() * sizeof(Scalar))); + } + + ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch); + + if (arg_block.data() != NULL) { + // Forward argument block buffer if possible. + return TensorBlockV2(arg_block.kind(), arg_block.data(), + desc.dimensions()); + + } else { + // Assign argument block expression to a buffer. + + // Try to reuse destination as an output buffer. + ScalarNoConst* output_buffer = + desc.template destination(); + bool materialized_in_output; + + if (output_buffer != NULL) { + materialized_in_output = true; + + } else { + materialized_in_output = false; + const size_t materialized_output_size = desc.size() * sizeof(Scalar); + void* output_scratch_mem = scratch.allocate(materialized_output_size); + output_buffer = static_cast(output_scratch_mem); + } + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + arg_desc.dimensions(), + internal::strides(arg_desc.dimensions()), + output_buffer), + arg_block.expr()); + + return TensorBlockV2( + materialized_in_output + ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + output_buffer, desc.dimensions()); + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { typename Storage::Type result = constCast(m_impl.data()); if (isOuterChipping() && result) { @@ -434,11 +510,12 @@ struct TensorEvaluator, Device> static const int PacketSize = PacketType::size; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + BlockAccessV2 = TensorEvaluator::RawAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; typedef typename internal::remove_const::type ScalarNoConst; @@ -448,6 +525,10 @@ struct TensorEvaluator, Device> typedef internal::TensorBlock OutputTensorBlock; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -539,6 +620,36 @@ struct TensorEvaluator, Device> input_block_strides, this->m_inputStrides, const_cast(output_block.data()))); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + const TensorBlockDesc& desc, const TensorBlockV2& block) { + assert(this->m_impl.data() != NULL); + + const Index chip_dim = this->m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) + : i > chip_dim ? desc.dimension(i - 1) + : 1; + } + + typedef TensorReshapingOp, + const typename TensorBlockV2::XprType> + TensorBlockExpr; + + typedef internal::TensorBlockAssignment + TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target( + input_block_dims, + internal::strides(this->m_impl.dimensions()), + this->m_impl.data(), this->srcCoeff(desc.offset())), + block.expr().reshape(input_block_dims)); + } }; -- cgit v1.2.3