From 0d2a14ce11c85abdfc68ca37fc66e3cace088b24 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Wed, 16 Oct 2019 17:14:37 -0700
Subject: Cleanup Tensor block destination and materialized block storage
 allocation

---
 unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h  |   6 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h | 258 ++++++++++++---------
 .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h    |  31 +--
 .../Eigen/CXX11/src/Tensor/TensorChipping.h        |  55 ++---
 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h  |   7 +-
 .../Eigen/CXX11/src/Tensor/TensorGenerator.h       |  27 +--
 unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h |  34 +--
 unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h |  26 +--
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h       |  58 +----
 9 files changed, 216 insertions(+), 286 deletions(-)

(limited to 'unsupported/Eigen')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index f2b9389c8..1f64de3a9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -235,11 +235,9 @@ struct TensorEvaluator, Device>
         m_leftImpl.data() != NULL) {
       // If destination has raw data access, we pass it as a potential
       // destination for a block descriptor evaluation.
-      desc.AddDestinationBuffer(
+      desc.template AddDestinationBuffer(
           /*dst_base=*/m_leftImpl.data() + desc.offset(),
-          /*dst_strides=*/internal::strides(m_leftImpl.dimensions()),
-          /*total_dst_bytes=*/
-          (internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar)));
+          /*dst_strides=*/internal::strides(m_leftImpl.dimensions()));
     }

     RightTensorBlock block = m_rightImpl.blockV2(desc, scratch, /*root_of_expr_ast=*/true);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
index 1f89c4e4a..c85c4c6c8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
@@ -70,91 +70,89 @@ class TensorBlockDescriptor {

   // If we evaluate a Tensor assignment, and expression on the left, already has
   // a memory buffer, then we might do performance optimization, and evaluate
-  // the root expression directly into the memory, or maybe use it as temporary
-  // storage for some of the subexpressions, to avoid dynamic memory allocation.
+  // the root expression directly into the final output memory. Sometimes it's
+  // possible to reuse it for materializing subexpressions inside an expression
+  // tree, to avoid dynamic memory allocation.
   //
-  // This is a type erased storage, because passing Scalar type through all the
-  // expression evaluation layers it way too many templates. Also it should be
-  // possible to use this destination as a temp buffer for materializing
-  // expressions with type, not matching the final output.
+  // The pointer type of the underlying storage is erased, because passing
+  // Scalar type through all the expression evaluation layers is way too many
+  // templates. In practice the destination buffer type should always match the
+  // evaluated expression scalar type.
   class DestinationBuffer {
    public:
+    enum DestinationBufferKind {
+      // Destination buffer is not defined (`m_data` == nullptr).
+      kEmpty,
+
+      // Tensor block defined by an owning tensor block descriptor can fit
+      // contiguously into the destination buffer. In this case it's safe to
+      // materialize the tensor block in the destination buffer, wrap it in a
+      // TensorMap, and use it to build an Eigen expression on top of it.
+      kContiguous,
+
+      // Destination buffer strides do not match strides of the contiguously
+      // stored block, and it's impossible to define a TensorMap over this
+      // buffer. However, if we are evaluating the root of an expression tree,
+      // we can still materialize an output into this destination, because we
+      // can guarantee that no one will ever access it through the block API.
+      //
+      // In theory it is possible to build a valid TensorStriding
+      // expression on top of this destination buffer, however it has
+      // inefficient coeff/packet access, and defeats the purpose of the fast
+      // block evaluation API.
+      kStrided
+    };
+
     template
     Scalar* data() const {
+      eigen_assert(m_data_type_size == sizeof(Scalar));
       return static_cast(m_data);
     }

-    template
-    Dimensions dimensions() const {
-      Dimensions dimensions;
-      for (int i = 0; i < NumDims; ++i) {
-        eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0);
-        dimensions[i] = m_dimensions[i] / sizeof(Scalar);
-      }
-      return dimensions;
-    }
-
-    template
-    Dimensions strides() const {
-      Dimensions strides;
-      for (int i = 0; i < NumDims; ++i) {
-        eigen_assert(m_strides[i] % sizeof(Scalar) == 0);
-        strides[i] = m_strides[i] / sizeof(Scalar);
-      }
-      return strides;
-    }
-
-    // Returns true if the tensor block corresponding to `desc` fits into the
-    // contiguous block of memory defined by `*this`.
-    template
-    bool fitsContiguously(const TensorBlockDescriptor& desc) const {
-      if (m_data == NULL) return false;
-
-      const Dimensions& desc_dims = desc.dimensions();
-      const Dimensions& dst_dims = dimensions();
-
-      if (!dimensions_match(desc_dims, dst_dims)) return false;
-
-      const Dimensions& desc_strides = internal::strides(desc_dims);
-      const Dimensions& dst_strides = strides();
-
-      // Compare strides ignoring dimensions of size `1`.
-      for (int i = 0; i < NumDims; ++i) {
-        if (desc_dims[i] == 1) continue;
-        if (desc_strides[i] != dst_strides[i]) return false;
-      }
-
-      return true;
-    }
+    const Dimensions& strides() const { return m_strides; }
+    const DestinationBufferKind& kind() const { return m_kind; }

    private:
    friend class TensorBlockDescriptor;

-    DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}
+    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}

     template
-    DestinationBuffer(Scalar* data, const Dimensions& dimensions,
-                      const Dimensions& strides, size_t total_dst_bytes)
+    DestinationBuffer(Scalar* data, const Dimensions& strides,
+                      DestinationBufferKind kind)
         : m_data(static_cast(data)),
-          m_dimensions(dimensions),
+          m_data_type_size(sizeof(Scalar)),
           m_strides(strides),
-          m_total_dst_bytes(total_dst_bytes) {
-      // TODO(ezhulenev): Benchmark template meta-unroll for this loop.
+          m_kind(kind) {}
+
+    template
+    static DestinationBuffer make(const TensorBlockDescriptor& desc,
+                                  Scalar* data, const Dimensions& strides) {
+      return DestinationBuffer(data, strides, kind(desc, strides));
+    }
+
+    template
+    static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
+                                      const Dimensions& strides) {
+      const Dimensions& desc_dims = desc.dimensions();
+      const Dimensions& desc_strides = internal::strides(desc_dims);
       for (int i = 0; i < NumDims; ++i) {
-        m_dimensions[i] *= sizeof(Scalar);
-        m_strides[i] *= sizeof(Scalar);
+        if (desc_dims[i] == 1) continue;
+        if (desc_strides[i] != strides[i]) return kStrided;
       }
+      return kContiguous;
     }

+    // Storage pointer is type erased, to reduce template bloat, but we still
+    // keep the size of the underlying element type for error checking.
     void* m_data;
-    Dimensions m_dimensions;
+    size_t m_data_type_size;
+
+    // Destination buffer dimensions always match the dimensions of the tensor
+    // block descriptor it belongs to; however, strides might be different.
     Dimensions m_strides;

-    // Total size of the memory buffer at the destination (typically the total
-    // size of the left hand side of an assignment expression). This can be the
-    // same as `array_prod(m_dimensions)` if the assignment target has just a
-    // single block, but typically it's a larger number.
-    size_t m_total_dst_bytes;
+    DestinationBufferKind m_kind;
   };

   TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
@@ -173,40 +171,31 @@ class TensorBlockDescriptor {
   IndexType dimension(int index) const { return m_dimensions[index]; }
   IndexType size() const { return array_prod(m_dimensions); }

-  template
-  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides,
-                            size_t total_dst_bytes) {
+  const DestinationBuffer& destination() const { return m_destination; }
+
+  template
+  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
+    eigen_assert(dst_base != NULL);
     m_destination =
-        DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes);
+        DestinationBuffer::template make(*this, dst_base, dst_strides);
   }

-  template
+  template
   void AddDestinationBuffer(
-      Scalar* dst_base, const DSizes& dst_strides,
-      size_t total_dst_bytes) {
+      Scalar* dst_base,
+      const DSizes& dst_strides) {
     // DSizes constructor will do index type promotion if it's safe.
-    AddDestinationBuffer(dst_base, Dimensions(dst_strides), total_dst_bytes);
+    AddDestinationBuffer(*this, dst_base, Dimensions(dst_strides));
   }

   TensorBlockDescriptor& DropDestinationBuffer() {
     m_destination.m_data = NULL;
+    m_destination.m_kind = DestinationBuffer::kEmpty;
     return *this;
   }

-  bool HasDestinationBuffer() const { return m_destination.m_data != NULL; }
-
-  const DestinationBuffer& GetDestinationBuffer() const {
-    return m_destination;
-  }
-
-  // Returns a non-nullptr pointer to a destination buffer memory if this
-  // block has a contiguous destination buffer.
-  template
-  Scalar* destination() const {
-    if (m_destination.template fitsContiguously(*this)) {
-      return m_destination.template data();
-    }
-    return NULL;
+  bool HasDestinationBuffer() const {
+    return m_destination.kind() != DestinationBuffer::kEmpty;
   }

   // Returns a copy of `*this` with updated offset.
@@ -404,6 +393,80 @@ class TensorMaterializedBlock {
   typedef internal::TensorBlockDescriptor TensorBlockDesc;

+  // TensorMaterializedBlock can be backed by different types of storage:
+  //
+  //   (1) Contiguous block of memory allocated with scratch allocator.
+  //   (2) Contiguous block of memory reused from tensor block descriptor
+  //       destination buffer.
+  //   (3) Strided block of memory reused from tensor block descriptor
+  //       destination buffer.
+  //
+  class Storage {
+   public:
+    Scalar* data() const { return m_data; }
+    const Dimensions& dimensions() const { return m_dimensions; }
+    const Dimensions& strides() const { return m_strides; }
+
+    TensorMaterializedBlock AsTensorMaterializedBlock() const {
+      return TensorMaterializedBlock(
+          m_materialized_in_output
+              ? internal::TensorBlockKind::kMaterializedInOutput
+              : internal::TensorBlockKind::kMaterializedInScratch,
+          m_data, m_dimensions, !m_strided_storage);
+    }
+
+   private:
+    friend class TensorMaterializedBlock;
+
+    Storage(Scalar* data, const Dimensions& dimensions,
+            const Dimensions& strides, bool materialized_in_output,
+            bool strided_storage)
+        : m_data(data),
+          m_dimensions(dimensions),
+          m_strides(strides),
+          m_materialized_in_output(materialized_in_output),
+          m_strided_storage(strided_storage) {}
+
+    Scalar* m_data;
+    Dimensions m_dimensions;
+    Dimensions m_strides;
+    bool m_materialized_in_output;
+    bool m_strided_storage;
+  };
+
+  // Creates storage for a materialized block either from the block descriptor
+  // destination buffer, or allocates a new buffer with the scratch allocator.
+  template
+  EIGEN_STRONG_INLINE static Storage prepareStorage(
+      TensorBlockDesc& desc, TensorBlockScratch& scratch,
+      bool allow_strided_storage = false) {
+    // Try to reuse destination as an output block buffer.
+    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
+
+    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
+      Scalar* buffer = desc.destination().template data();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(),
+                     internal::strides(desc.dimensions()),
+                     /*materialized_in_output=*/true,
+                     /*strided_storage=*/false);
+
+    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
+               allow_strided_storage) {
+      Scalar* buffer = desc.destination().template data();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
+                     /*materialized_in_output=*/true, /*strided_storage=*/true);
+
+    } else {
+      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+      return Storage(static_cast(mem), desc.dimensions(),
+                     internal::strides(desc.dimensions()),
+                     /*materialized_in_output=*/false,
+                     /*strided_storage=*/false);
+    }
+  }
+
   // Creates a materialized block for the given descriptor from a memory buffer.
   template
   EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
@@ -448,19 +511,8 @@ class TensorMaterializedBlock {
                                      block_start, desc.dimensions());
     } else {
-      // Try to reuse destination as an output block buffer.
-      Scalar* block_buffer = desc.template destination();
-      bool materialized_in_output;
-
-      if (block_buffer != NULL) {
-        desc.DropDestinationBuffer();
-        materialized_in_output = true;
-
-      } else {
-        materialized_in_output = false;
-        void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
-        block_buffer = static_cast(mem);
-      }
+      // Reuse destination buffer or allocate new buffer with scratch allocator.
+      const Storage storage = prepareStorage(desc, scratch);

       typedef internal::TensorBlockIOV2 TensorBlockIO;
       typedef typename TensorBlockIO::Dst TensorBlockIODst;
       typedef typename TensorBlockIO::Src TensorBlockIOSrc;

       TensorBlockIOSrc src(internal::strides(Dimensions(data_dims)), data,
                            desc.offset());
-      TensorBlockIODst dst(desc.dimensions(),
-                           internal::strides(desc.dimensions()),
-                           block_buffer);
+      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
+                           storage.data());

       TensorBlockIO::Copy(dst, src);
-
-      return TensorMaterializedBlock(
-          materialized_in_output
-              ? internal::TensorBlockKind::kMaterializedInOutput
-              : internal::TensorBlockKind::kMaterializedInScratch,
-          block_buffer, desc.dimensions());
+      return storage.AsTensorMaterializedBlock();
     }
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 9a1fc9217..58164c13a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -890,24 +890,14 @@ struct TensorEvaluator, Device>
       return emptyBlock();
     }

-    // Check if we can reuse `desc` destination, or allocate new scratch buffer.
-    ScalarNoConst* materialized_output =
-        desc.template destination();
-    bool materialized_in_output;
+    // Prepare storage for the materialized broadcasting result.
+    const typename TensorBlockV2::Storage block_storage =
+        TensorBlockV2::prepareStorage(desc, scratch);
+    ScalarNoConst* materialized_output = block_storage.data();

-    if (materialized_output != NULL) {
-      desc.DropDestinationBuffer();
-      materialized_in_output = true;
-
-    } else {
-      materialized_in_output = false;
-      const size_t materialized_output_size = desc.size() * sizeof(Scalar);
-      void* output_scratch_mem = scratch.allocate(materialized_output_size);
-      materialized_output = static_cast(output_scratch_mem);
-    }
-
-    ScalarNoConst* materialized_input = NULL;
+    // We potentially will need to materialize input blocks.
     size_t materialized_input_size = 0;
+    ScalarNoConst* materialized_input = NULL;

     // Initialize block broadcating iterator state for outer dimensions (outer
     // with regard to bcast dimension). Dimension in this array are always in
@@ -951,11 +941,7 @@ struct TensorEvaluator, Device>
       }
     }

-    return TensorBlockV2(
-        materialized_in_output
-            ? internal::TensorBlockKind::kMaterializedInOutput
-            : internal::TensorBlockKind::kMaterializedInScratch,
-        materialized_output, desc.dimensions());
+    return block_storage.AsTensorMaterializedBlock();
   }

   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
@@ -1019,7 +1005,8 @@ struct TensorEvaluator, Device>
     Index output_span;
   };

-  BlockBroadcastingParams blockBroadcastingParams(TensorBlockDesc& desc) const {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
+  blockBroadcastingParams(TensorBlockDesc& desc) const {
     BlockBroadcastingParams params;

     params.input_dims = Dimensions(m_impl.dimensions());
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 7eaf1f09e..fe30f9867 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -369,28 +369,35 @@ struct TensorEvaluator, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
   blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
+          bool root_of_expr_ast = false) const {
     const Index chip_dim = m_dim.actualDim();

     DSizes input_block_dims;
     for (int i = 0; i < NumInputDims; ++i) {
-      input_block_dims[i] = i < chip_dim ? desc.dimension(i)
-                          : i > chip_dim ? desc.dimension(i - 1)
-                          : 1;
+      input_block_dims[i]
+          = i < chip_dim ? desc.dimension(i)
+          : i > chip_dim ? desc.dimension(i - 1)
+          : 1;
     }

     ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);

     // Try to reuse destination buffer for materializing argument block.
-    ScalarNoConst* destination_buffer =
-        desc.template destination();
-    if (destination_buffer != NULL) {
-      arg_desc.AddDestinationBuffer(
-          destination_buffer, internal::strides(arg_desc.dimensions()),
-          (arg_desc.size() * sizeof(Scalar)));
+    if (desc.HasDestinationBuffer()) {
+      DSizes arg_destination_strides;
+      for (int i = 0; i < NumInputDims; ++i) {
+        arg_destination_strides[i]
+            = i < chip_dim ? desc.destination().strides()[i]
+            : i > chip_dim ? desc.destination().strides()[i - 1]
+            : 0;  // for dimensions of size `1` the stride should never be used.
+      }
+
+      arg_desc.template AddDestinationBuffer(
+          desc.destination().template data(),
+          arg_destination_strides);
     }

-    ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch);
+    ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch, root_of_expr_ast);
     if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();

     if (arg_block.data() != NULL) {
@@ -401,21 +408,9 @@ struct TensorEvaluator, Device>
     } else {
       // Assign argument block expression to a buffer.

-      // Try to reuse destination as an output buffer.
-      ScalarNoConst* output_buffer =
-          desc.template destination();
-      bool materialized_in_output;
-
-      if (output_buffer != NULL) {
-        desc.DropDestinationBuffer();
-        materialized_in_output = true;
-
-      } else {
-        materialized_in_output = false;
-        const size_t materialized_output_size = desc.size() * sizeof(Scalar);
-        void* output_scratch_mem = scratch.allocate(materialized_output_size);
-        output_buffer = static_cast(output_scratch_mem);
-      }
+      // Prepare storage for the materialized chipping result.
+      const typename TensorBlockV2::Storage block_storage =
+          TensorBlockV2::prepareStorage(desc, scratch);

       typedef internal::TensorBlockAssignment<
           ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
          TensorBlockAssignment;

       TensorBlockAssignment::Run(
           TensorBlockAssignment::target(
               arg_desc.dimensions(),
               internal::strides(arg_desc.dimensions()),
-              output_buffer),
+              block_storage.data()),
           arg_block.expr());

-      return TensorBlockV2(
-          materialized_in_output
-              ? internal::TensorBlockKind::kMaterializedInOutput
-              : internal::TensorBlockKind::kMaterializedInScratch,
-          output_buffer, desc.dimensions());
+      return block_storage.AsTensorMaterializedBlock();
     }
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 8c44f1c4a..22fc64c1f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -173,12 +173,9 @@ struct TensorEvaluator, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
       TensorBlockDesc& desc, TensorBlockScratch& scratch) {
     // Add `m_buffer` as destination buffer to the block descriptor.
-    desc.AddDestinationBuffer(
+    desc.template AddDestinationBuffer(
         /*dst_base=*/m_buffer + desc.offset(),
-        /*dst_strides=*/internal::strides(m_impl.dimensions()),
-        /*total_dst_bytes=*/
-        (internal::array_prod(m_impl.dimensions())
-         * sizeof(Scalar)));
+        /*dst_strides=*/internal::strides(m_impl.dimensions()));

     ArgTensorBlock block = m_impl.blockV2(desc, scratch);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index c69e2df92..f590c71be 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -248,21 +248,6 @@ struct TensorEvaluator, Device>
     extract_coordinates(desc.offset(), coords);
     array initial_coords = coords;

-    // Try to reuse destination as an output block buffer.
-    CoeffReturnType* block_buffer =
-        desc.template destination();
-    bool materialized_in_output;
-
-    if (block_buffer != NULL) {
-      desc.DropDestinationBuffer();
-      materialized_in_output = true;
-
-    } else {
-      materialized_in_output = false;
-      void* mem = scratch.allocate(desc.size() * sizeof(CoeffReturnType));
-      block_buffer = static_cast(mem);
-    }
-
     // Offset in the output block buffer.
     Index offset = 0;
@@ -278,6 +263,12 @@ struct TensorEvaluator, Device>
     }
     eigen_assert(it[0].stride == 1);

+    // Prepare storage for the materialized generator result.
+    const typename TensorBlockV2::Storage block_storage =
+        TensorBlockV2::prepareStorage(desc, scratch);
+
+    CoeffReturnType* block_buffer = block_storage.data();
+
     while (it[NumDims - 1].count < it[NumDims - 1].size) {
       // Generate data for the inner-most dimension.
       for (Index i = 0; i < it[0].size; ++i) {
@@ -304,11 +295,7 @@ struct TensorEvaluator, Device>
       }
     }

-    return TensorBlockV2(
-        materialized_in_output
-            ? internal::TensorBlockKind::kMaterializedInOutput
-            : internal::TensorBlockKind::kMaterializedInScratch,
-        block_buffer, desc.dimensions());
+    return block_storage.AsTensorMaterializedBlock();
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 99c74fc67..1104f02c7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -238,22 +238,6 @@ struct TensorEvaluator, Device
                                               desc.dimensions());
     }

-    // Check if we can reuse `desc` destination, or allocate new scratch buffer.
-    ScalarNoConst* materialized_output =
-        desc.template destination();
-    bool materialized_in_output;
-
-    if (materialized_output != NULL) {
-      desc.DropDestinationBuffer();
-      materialized_in_output = true;
-
-    } else {
-      const size_t materialized_output_size = desc.size() * sizeof(Scalar);
-      void* output_scratch_mem = scratch.allocate(materialized_output_size);
-      materialized_output = static_cast(output_scratch_mem);
-      materialized_in_output = false;
-    }
-
     static const bool IsColMajor = Layout == static_cast(ColMajor);

     Index offset = desc.offset();
@@ -363,6 +347,10 @@ struct TensorEvaluator, Device
     typedef internal::StridedLinearBufferCopy LinCopy;

+    // Prepare storage for the materialized padding result.
+    const typename TensorBlockV2::Storage block_storage =
+        TensorBlockV2::prepareStorage(desc, scratch);
+
     // Iterate copying data from `m_impl.data()` to the output buffer.
     for (Index size = 0; size < output_size; size += output_inner_dim_size) {
       // Detect if we are in the padded region (exclude innermost dimension).
@@ -376,7 +364,7 @@ struct TensorEvaluator, Device
       if (is_padded) {
         // Fill with padding value.
         LinCopy::template Run(
-            typename LinCopy::Dst(output_offset, 1, materialized_output),
+            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
             typename LinCopy::Src(0, 0, &m_paddingValue),
             output_inner_dim_size);

@@ -385,7 +373,7 @@ struct TensorEvaluator, Device
           const Index out = output_offset;

           LinCopy::template Run(
-              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Dst(out, 1, block_storage.data()),
               typename LinCopy::Src(0, 0, &m_paddingValue),
               output_inner_pad_before_size);
         }
@@ -397,7 +385,7 @@ struct TensorEvaluator, Device
           eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);

           LinCopy::template Run(
-              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Dst(out, 1, block_storage.data()),
               typename LinCopy::Src(in, 1, m_impl.data()),
               output_inner_copy_size);
         }
@@ -407,7 +395,7 @@ struct TensorEvaluator, Device
                             output_inner_copy_size;

           LinCopy::template Run(
-              typename LinCopy::Dst(out, 1, materialized_output),
+              typename LinCopy::Dst(out, 1, block_storage.data()),
               typename LinCopy::Src(0, 0, &m_paddingValue),
               output_inner_pad_after_size);
         }
@@ -431,11 +419,7 @@ struct TensorEvaluator, Device
       }
     }

-    return TensorBlockV2(materialized_in_output
-                             ? internal::TensorBlockKind::kMaterializedInOutput
-                             : internal::TensorBlockKind::kMaterializedInScratch,
-                         materialized_output,
-                         desc.dimensions());
+    return block_storage.AsTensorMaterializedBlock();
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index a51c88540..ae3ab5f81 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -370,21 +370,6 @@ struct TensorEvaluator, Device
     static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
     const bool inner_dim_reversed = m_reverse[inner_dim_idx];

-    // Try to reuse destination as an output block buffer.
-    CoeffReturnType* block_buffer =
-        desc.template destination();
-    bool materialized_in_output;
-
-    if (block_buffer != NULL) {
-      desc.DropDestinationBuffer();
-      materialized_in_output = true;
-
-    } else {
-      materialized_in_output = false;
-      void* mem = scratch.allocate(desc.size() * sizeof(CoeffReturnType));
-      block_buffer = static_cast(mem);
-    }
-
     // Offset in the output block.
     Index block_offset = 0;
@@ -438,6 +423,11 @@ struct TensorEvaluator, Device
     const Index inner_dim_size = it[effective_inner_dim].size;

+    // Prepare storage for the materialized reverse result.
+    const typename TensorBlockV2::Storage block_storage =
+        TensorBlockV2::prepareStorage(desc, scratch);
+    CoeffReturnType* block_buffer = block_storage.data();
+
     while (it[NumDims - 1].count < it[NumDims - 1].size) {
       // Copy inner-most dimension data from reversed location in input.
       Index dst = block_offset;
@@ -475,11 +465,7 @@ struct TensorEvaluator, Device
       }
     }

-    return TensorBlockV2(
-        materialized_in_output
-            ? internal::TensorBlockKind::kMaterializedInOutput
-            : internal::TensorBlockKind::kMaterializedInScratch,
-        block_buffer, desc.dimensions());
+    return block_storage.AsTensorMaterializedBlock();
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index bb9908b62..df4cd1eb3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -351,66 +351,20 @@ struct TensorEvaluator, Device>
     typedef typename TensorBlockIO::Dst TensorBlockIODst;
     typedef typename TensorBlockIO::Src TensorBlockIOSrc;

-    ScalarNoConst* block_buffer = NULL;
-    typename TensorBlockIO::Dimensions block_strides;
-
-    bool materialized_in_output = false;
-    bool has_valid_materialized_expr = true;
-
-    if (desc.HasDestinationBuffer()) {
-      // Check if we can reuse destination buffer for block materialization.
-      const typename TensorBlockDesc::DestinationBuffer& destination_buffer =
-          desc.GetDestinationBuffer();
-
-      const bool dims_match = dimensions_match(
-          desc.dimensions(), destination_buffer.template dimensions());
-
-      const bool strides_match =
-          dimensions_match(internal::strides(desc.dimensions()),
-                           destination_buffer.template strides());
-
-      if (dims_match && strides_match) {
-        // Destination buffer fits the block contiguously.
-        materialized_in_output = true;
-        has_valid_materialized_expr = true;
-        block_buffer = destination_buffer.template data();
-        block_strides = internal::strides(desc.dimensions());
-        eigen_assert(block_buffer != NULL);
-
-      } else if (dims_match && root_of_expr_ast) {
-        // Destination buffer has strides not matching the block strides, but
-        // for the root of the expression tree it's safe to materialize anyway.
-        materialized_in_output = true;
-        has_valid_materialized_expr = false;
-        block_buffer = destination_buffer.template data();
-        block_strides = destination_buffer.template strides();
-        eigen_assert(block_buffer != NULL);
-      }
-
-      if (materialized_in_output) desc.DropDestinationBuffer();
-    }
-
-    // If we were not able to reuse destination buffer, allocate temporary
-    // buffer for block evaluation using scratch allocator.
-    if (!materialized_in_output) {
-      void* mem = scratch.allocate(desc.size() * sizeof(ScalarNoConst));
-      block_buffer = static_cast(mem);
-      block_strides = internal::strides(desc.dimensions());
-    }
+    const typename TensorBlockV2::Storage block_storage =
+        TensorBlockV2::prepareStorage(
+            desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);

     typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
     TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));

-    TensorBlockIODst dst(desc.dimensions(), block_strides, block_buffer);
+    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
+                         block_storage.data());

     typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
     TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

-    return TensorBlockV2(
-        materialized_in_output
-            ? internal::TensorBlockKind::kMaterializedInOutput
-            : internal::TensorBlockKind::kMaterializedInScratch,
-        block_buffer, desc.dimensions(), has_valid_materialized_expr);
+    return block_storage.AsTensorMaterializedBlock();
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
--
cgit v1.2.3
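
For reference, every evaluator hunk above converges on the same pattern: ask
TensorMaterializedBlock::prepareStorage for block storage (it reuses the
descriptor's destination buffer when the block fits it contiguously, reuses a
strided destination only when the caller explicitly allows it at the root of
the expression tree, and otherwise allocates from the scratch allocator),
materialize into that storage, and return storage.AsTensorMaterializedBlock().
The sketch below is a simplified, self-contained model of that control flow;
Desc, Scratch, and Storage are illustrative stand-ins, not the actual Eigen
classes or API.

// Illustrative sketch only: models the prepareStorage decision, not Eigen's API.
#include <cstddef>
#include <vector>

enum class BufferKind { kEmpty, kContiguous, kStrided };

struct Desc {                    // stand-in for TensorBlockDescriptor
  std::size_t size;              // number of coefficients in the block
  float* destination;            // optional destination buffer (may be null)
  BufferKind kind;               // how the destination relates to the block
};

struct Scratch {                 // stand-in for the scratch allocator
  std::vector<float> mem;
  float* allocate(std::size_t n) {
    mem.resize(n);
    return mem.data();
  }
};

struct Storage {                 // stand-in for TensorMaterializedBlock::Storage
  float* data;
  bool materialized_in_output;   // true if the destination buffer was reused
};

// Reuse the destination buffer when the block fits it contiguously, or when
// strides mismatch but strided storage is explicitly allowed (root of the
// expression tree); otherwise fall back to the scratch allocator.
Storage prepareStorage(Desc& desc, Scratch& scratch,
                       bool allow_strided_storage = false) {
  if (desc.kind == BufferKind::kContiguous ||
      (desc.kind == BufferKind::kStrided && allow_strided_storage)) {
    Storage storage = {desc.destination, /*materialized_in_output=*/true};
    desc.kind = BufferKind::kEmpty;  // analogous to DropDestinationBuffer()
    return storage;
  }
  return Storage{scratch.allocate(desc.size), /*materialized_in_output=*/false};
}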