From d380c23b2cc0b02e10819e779c73cde2c62603b2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 14 Oct 2019 14:31:59 -0700 Subject: Block evaluation for TensorGenerator/TensorReverse/TensorShuffling --- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h | 112 ++++++++++------ .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 15 ++- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 13 -- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 33 ++++- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 7 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 147 ++++++++++++++++++++- 14 files changed, 273 insertions(+), 84 deletions(-) (limited to 'unsupported/Eigen/CXX11') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 29aa7a97e..f2b9389c8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -242,9 +242,8 @@ struct TensorEvaluator, Device> (internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar))); } - RightTensorBlock block = m_rightImpl.blockV2(desc, scratch); - // If block was evaluated into a destination, there is no need to do - // assignment. + RightTensorBlock block = m_rightImpl.blockV2(desc, scratch, /*root_of_expr_ast=*/true); + // If block was evaluated into a destination, there is no need to do assignment. if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { m_leftImpl.writeBlockV2(desc, block); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h index b8c592543..099d7cd57 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h @@ -45,6 +45,12 @@ EIGEN_ALWAYS_INLINE DSizes strides( return strides; } +template +EIGEN_ALWAYS_INLINE DSizes strides( + const Eigen::array& dimensions) { + return strides(DSizes(dimensions)); +} + #if EIGEN_HAS_CXX11 template EIGEN_STRONG_INLINE DSizes strides( @@ -78,23 +84,24 @@ class TensorBlockDescriptor { return static_cast(m_data); } - private: - friend class TensorBlockDescriptor; - - DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {} + template + Dimensions dimensions() const { + Dimensions dimensions; + for (int i = 0; i < NumDims; ++i) { + eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0); + dimensions[i] = m_dimensions[i] / sizeof(Scalar); + } + return dimensions; + } template - DestinationBuffer(Scalar* data, const Dimensions& dimensions, - const Dimensions& strides, size_t total_dst_bytes) - : m_data(static_cast(data)), - m_dimensions(dimensions), - m_strides(strides), - m_total_dst_bytes(total_dst_bytes) { - // TODO(ezhulenev): Benchmark template meta-unroll for this loop. 
+ Dimensions strides() const { + Dimensions strides; for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] *= sizeof(Scalar); - m_strides[i] *= sizeof(Scalar); + eigen_assert(m_strides[i] % sizeof(Scalar) == 0); + strides[i] = m_strides[i] / sizeof(Scalar); } + return strides; } // Returns true if the tensor block corresponding to `desc` fits into the @@ -109,29 +116,34 @@ class TensorBlockDescriptor { if (!dimensions_match(desc_dims, dst_dims)) return false; const Dimensions& desc_strides = internal::strides(desc_dims); - const Dimensions& dst_strides = internal::strides(dst_dims); + const Dimensions& dst_strides = strides(); - return dimensions_match(desc_strides, dst_strides); - } - - template - Dimensions dimensions() const { - Dimensions dimensions; + // Compare strides ignoring dimensions of size `1`. for (int i = 0; i < NumDims; ++i) { - eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0); - dimensions[i] = m_dimensions[i] / sizeof(Scalar); + if (desc_dims[i] == 1) continue; + if (desc_strides[i] != dst_strides[i]) return false; } - return dimensions; + + return true; } + private: + friend class TensorBlockDescriptor; + + DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {} + template - Dimensions strides() const { - Dimensions strides; + DestinationBuffer(Scalar* data, const Dimensions& dimensions, + const Dimensions& strides, size_t total_dst_bytes) + : m_data(static_cast(data)), + m_dimensions(dimensions), + m_strides(strides), + m_total_dst_bytes(total_dst_bytes) { + // TODO(ezhulenev): Benchmark template meta-unroll for this loop. for (int i = 0; i < NumDims; ++i) { - eigen_assert(m_strides[i] % sizeof(Scalar) == 0); - strides[i] = m_strides[i] / sizeof(Scalar); + m_dimensions[i] *= sizeof(Scalar); + m_strides[i] *= sizeof(Scalar); } - return strides; } void* m_data; @@ -181,6 +193,12 @@ class TensorBlockDescriptor { return *this; } + bool HasDestinationBuffer() const { return m_destination.m_data != NULL; } + + const DestinationBuffer& GetDestinationBuffer() const { + return m_destination; + } + // Returns a non-nullptr pointer to a destination buffer memory if this // block has a contiguous destination buffer. template @@ -191,6 +209,11 @@ class TensorBlockDescriptor { return NULL; } + // Returns a copy of `*this` with updated offset. + TensorBlockDescriptor WithOffset(IndexType offset) const { + return TensorBlockDescriptor(offset, m_dimensions, m_destination); + } + private: // Offset and dimensions are immutable after construction. Block descriptor // can only be mutated by adding or dropping destination. @@ -294,18 +317,12 @@ enum TensorBlockKind { // Tensor block that was materialized directly into the final output memory // buffer. For example if the left side of an assignment is a Tensor, we can - // directly materialize the block in the destination memory. The block - // expression is still a valid Tensor expression, and can be used to build - // lazy expressions. + // directly materialize the block in the destination memory. + // + // If strides in the output buffer do not match tensor block strides, the + // Tensor expression will be invalid, and should not be used by + // TensorBlockAssign or for constructing another block expression. kMaterializedInOutput - - // TODO(ezhulenev): If we know that we are evaluating a block, for the root of - // the expression tree, it might be beneficial to do an assignment to the - // output memory buffer, even if it will be impossible to construct a valid - // block expression after that (e.g. 
output memory buffer has strides not - // compatible with TensorMap). This might be a performance optimization for - // uniformly shaped blocks, because for blocks skewed towards inner dimension - // `kMaterializedInOutput` should always work. }; #if !EIGEN_HAS_CXX11 } // namespace TensorBlockKind @@ -346,6 +363,11 @@ struct XprScalar { // Tensor), or a memory buffer allocated with scratch allocator, and in this // case the scratch allocator will deallocate it at the end of block based // expression execution. +// +// If the block was evaluated directly into the output buffer, and strides in +// the output buffer do not match block strides, the TensorMap expression will +// be invalid, and should never be used in block assignment or any other tensor +// expression. template @@ -358,11 +380,12 @@ class TensorMaterializedBlock { typedef TensorMap > XprType; TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, - const Dimensions& dimensions) + const Dimensions& dimensions, bool valid_expr = true) : m_kind(kind), m_data(data), m_dimensions(dimensions), - m_expr(m_data, m_dimensions) { + m_expr(m_data, m_dimensions), + m_valid_expr(valid_expr) { eigen_assert(m_kind == internal::TensorBlockKind::kView || m_kind == internal::TensorBlockKind::kMaterializedInScratch || m_kind == internal::TensorBlockKind::kMaterializedInOutput); @@ -372,7 +395,10 @@ class TensorMaterializedBlock { // NOTE(ezhulenev): Returning XprType by value like in other block types // causes asan failures. The theory is that XprType::Nested doesn't work // properly for TensorMap. - const XprType& expr() const { return m_expr; } + const XprType& expr() const { + eigen_assert(m_valid_expr); + return m_expr; + } const Scalar* data() const { return m_data; } void cleanup() {} @@ -427,6 +453,7 @@ class TensorMaterializedBlock { bool materialized_in_output; if (block_buffer != NULL) { + desc.DropDestinationBuffer(); materialized_in_output = true; } else { @@ -461,6 +488,7 @@ class TensorMaterializedBlock { const Scalar* m_data; Dimensions m_dimensions; XprType m_expr; + bool m_valid_expr; }; // -------------------------------------------------------------------------- // diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index dc9551d32..cc0a00e8d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -882,7 +882,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 20591da33..7eaf1f09e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -368,7 +368,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { const Index chip_dim = m_dim.actualDim(); DSizes input_block_dims; @@ -390,6 +391,7 @@ struct TensorEvaluator, Device> } ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, 
scratch); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); if (arg_block.data() != NULL) { // Forward argument block buffer if possible. @@ -405,6 +407,7 @@ struct TensorEvaluator, Device> bool materialized_in_output; if (output_buffer != NULL) { + desc.DropDestinationBuffer(); materialized_in_output = true; } else { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index cc3e67677..2a6d67ad5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -404,7 +404,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { return TensorBlockV2(m_impl.blockV2(desc, scratch), TensorConversionOpBlockFactory()); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index d7bebd30b..132458a20 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -481,7 +481,7 @@ struct sizes_match_below_dim { template -EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) { return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index b77d8fe84..4c2767d44 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -166,7 +166,8 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { assert(m_data != NULL); return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); } @@ -353,7 +354,8 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { assert(m_data != NULL); return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); } @@ -571,7 +573,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { return TensorBlockV2(m_argImpl.blockV2(desc, scratch), m_functor); } @@ -729,7 +732,8 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { // It's unsafe to pass destination buffer to underlying expressions, because // output might be aliased with one of the inputs. 
desc.DropDestinationBuffer(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 6ad6327a6..97ac96db1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -521,19 +521,6 @@ class TensorExecutor::value) { - internal::TensorExecutor::run(expr, device); - evaluator.cleanup(); - return; - } const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index d98af1355..7d12e781e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -176,7 +176,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { assert(m_buffer != NULL); return TensorBlockV2::materialize(m_buffer, m_impl.dimensions(), desc, scratch); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 38d0bf7d3..c69e2df92 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -238,7 +238,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); @@ -253,6 +254,7 @@ struct TensorEvaluator, Device> bool materialized_in_output; if (block_buffer != NULL) { + desc.DropDestinationBuffer(); materialized_in_output = true; } else { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index c9d78ba9b..ab3a979a8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -365,7 +365,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { eigen_assert(m_impl.data() != NULL); eigen_assert((kind == Runtime) || (kind == OneByN && desc.dimensions()[0] == 1) || @@ -611,7 +612,7 @@ struct TensorEvaluator, Devi IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -624,7 +625,12 @@ struct TensorEvaluator, Devi typedef typename TensorBlock::Dimensions TensorBlockDimensions; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + // Tensor slicing does not change the block type. 
+ typedef typename TensorEvaluator::TensorBlockV2 + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -804,6 +810,15 @@ struct TensorEvaluator, Devi m_impl.block(&input_block); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); + TensorBlockV2 block = m_impl.blockV2(arg_desc, scratch); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + return block; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { typename Storage::Type result = constCast(m_impl.data()); if (result) { @@ -900,7 +915,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -913,7 +928,8 @@ struct TensorEvaluator, Device> typedef typename TensorBlock::Dimensions TensorBlockDimensions; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -987,6 +1003,13 @@ struct TensorEvaluator, Device> block.block_strides(), TensorBlockDimensions(this->m_inputStrides), const_cast(block.data()))); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + const TensorBlockDesc& desc, const TensorBlockV2& block) { + TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); + this->m_impl.writeBlockV2(arg_desc, block); + } }; namespace internal { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index a0b4e04b1..99c74fc67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -230,7 +230,8 @@ struct TensorEvaluator, Device } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { // If one of the dimensions is zero, return empty block view. if (desc.size() == 0) { return TensorBlockV2(internal::TensorBlockKind::kView, NULL, @@ -240,8 +241,8 @@ struct TensorEvaluator, Device // Check if we can reuse `desc` destination, or allocate new scratch buffer. 
ScalarNoConst* materialized_output = desc.template destination(); - bool materialized_in_output; + if (materialized_output != NULL) { desc.DropDestinationBuffer(); materialized_in_output = true; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 6e7abeb09..a51c88540 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -355,7 +355,8 @@ struct TensorEvaluator, Device } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { // TODO(ezhulenev): If underlying tensor expression supports and prefers // block evaluation we must use it. Currently we use coeff and packet // access into the underlying tensor expression. @@ -370,10 +371,12 @@ struct TensorEvaluator, Device const bool inner_dim_reversed = m_reverse[inner_dim_idx]; // Try to reuse destination as an output block buffer. - CoeffReturnType* block_buffer = desc.template destination(); + CoeffReturnType* block_buffer = + desc.template destination(); bool materialized_in_output; if (block_buffer != NULL) { + desc.DropDestinationBuffer(); materialized_in_output = true; } else { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 5e8abad75..bb9908b62 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -116,7 +116,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -131,7 +131,12 @@ struct TensorEvaluator, Device> TensorBlockReader; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -143,6 +148,7 @@ struct TensorEvaluator, Device> const Shuffle& shuffle = op.shufflePermutation(); m_is_identity = true; for (int i = 0; i < NumDims; ++i) { + m_shuffle[i] = static_cast(shuffle[i]); m_dimensions[i] = input_dims[shuffle[i]]; m_inverseShuffle[shuffle[i]] = i; if (m_is_identity && shuffle[i] != i) { @@ -241,7 +247,6 @@ struct TensorEvaluator, Device> 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::kUniformAllDims, block_total_size_max)); - m_impl.getResourceRequirements(resources); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( @@ -336,6 +341,78 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + assert(m_impl.data() != NULL); + + typedef internal::TensorBlockIOV2 + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src 
TensorBlockIOSrc; + + ScalarNoConst* block_buffer = NULL; + typename TensorBlockIO::Dimensions block_strides; + + bool materialized_in_output = false; + bool has_valid_materialized_expr = true; + + if (desc.HasDestinationBuffer()) { + // Check if we can reuse destination buffer for block materialization. + const typename TensorBlockDesc::DestinationBuffer& destination_buffer = + desc.GetDestinationBuffer(); + + const bool dims_match = dimensions_match( + desc.dimensions(), destination_buffer.template dimensions()); + + const bool strides_match = + dimensions_match(internal::strides(desc.dimensions()), + destination_buffer.template strides()); + + if (dims_match && strides_match) { + // Destination buffer fits the block contiguously. + materialized_in_output = true; + has_valid_materialized_expr = true; + block_buffer = destination_buffer.template data(); + block_strides = internal::strides(desc.dimensions()); + eigen_assert(block_buffer != NULL); + + } else if (dims_match && root_of_expr_ast) { + // Destination buffer has strides not matching the block strides, but + // for the root of the expression tree it's safe to materialize anyway. + materialized_in_output = true; + has_valid_materialized_expr = false; + block_buffer = destination_buffer.template data(); + block_strides = destination_buffer.template strides(); + eigen_assert(block_buffer != NULL); + } + + if (materialized_in_output) desc.DropDestinationBuffer(); + } + + // If we were not able to reuse destination buffer, allocate temporary + // buffer for block evaluation using scratch allocator. + if (!materialized_in_output) { + void* mem = scratch.allocate(desc.size() * sizeof(ScalarNoConst)); + block_buffer = static_cast(mem); + block_strides = internal::strides(desc.dimensions()); + } + + typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides); + TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset())); + + TensorBlockIODst dst(desc.dimensions(), block_strides, block_buffer); + + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle); + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + return TensorBlockV2( + materialized_in_output + ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + block_buffer, desc.dimensions(), has_valid_materialized_expr); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { const double compute_cost = m_is_identity ? TensorOpCost::AddCost() : NumDims * (2 * TensorOpCost::AddCost() + @@ -400,7 +477,8 @@ struct TensorEvaluator, Device> Dimensions m_dimensions; bool m_is_identity; - array m_inverseShuffle; + array m_shuffle; + array m_inverseShuffle; // TODO(ezhulenev): Make it int type. 
array m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; @@ -431,7 +509,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, RawAccess = false @@ -445,7 +523,7 @@ struct TensorEvaluator, Device> TensorBlockWriter; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -477,6 +555,63 @@ struct TensorEvaluator, Device> this->m_inverseShuffle, this->m_unshuffledInputStrides, this->m_impl.data()); } + +template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + const TensorBlockDesc& desc, const TensorBlockV2& block) { + eigen_assert(this->m_impl.data() != NULL); + + typedef internal::TensorBlockIOV2 + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + const Scalar* block_buffer = block.data(); + + // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen + // expression with coefficient and packet access as `src`. + void* mem = NULL; + if (block_buffer == NULL) { + mem = this->m_device.allocate(desc.size() * sizeof(Scalar)); + ScalarNoConst* buf = static_cast(mem); + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumDims, typename TensorBlockV2::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + desc.dimensions(), internal::strides(desc.dimensions()), + buf), + block.expr()); + + block_buffer = buf; + } + + // Read from block. + TensorBlockIOSrc src(internal::strides(desc.dimensions()), + block_buffer); + + // Write to the output buffer. + typename TensorBlockIO::Dimensions output_strides( + this->m_unshuffledInputStrides); + typename TensorBlockIO::Dimensions output_dimensions; + for (int i = 0; i < NumDims; ++i) { + output_dimensions[this->m_shuffle[i]] = desc.dimension(i); + } + TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(), + this->srcCoeff(desc.offset())); + + // Reorder dimensions according to the shuffle. + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map; + for (int i = 0; i < NumDims; ++i) { + dst_to_src_dim_map[i] = static_cast(this->m_inverseShuffle[i]); + } + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + // Deallocate temporary buffer used for the block materialization. + if (mem != NULL) this->m_device.deallocate(mem); + } }; -- cgit v1.2.3
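
Note (not part of the patch): a minimal user-level sketch of the kinds of expressions whose evaluators gain the new block (V2) path in this commit — generator, reverse, and shuffle. Tensor sizes and the generator functor below are illustrative assumptions, not taken from the patch, and whether the block path is actually taken depends on the executor and device configuration.

// Sketch only: user-level expressions covered by the block (V2) evaluation
// path added in this patch. Sizes and the generator functor are assumptions.
#include <unsupported/Eigen/CXX11/Tensor>

struct CoordGenerator {
  // Called once per output coordinate; returns a value derived from indices.
  float operator()(const Eigen::array<Eigen::DenseIndex, 2>& coords) const {
    return static_cast<float>(coords[0] * 100 + coords[1]);
  }
};

int main() {
  Eigen::Tensor<float, 2> input(128, 64);
  input.setRandom();

  // TensorGenerator: the evaluator can now materialize a block directly into
  // the destination buffer when the block descriptor carries one.
  Eigen::Tensor<float, 2> generated(128, 64);
  generated = input.generate(CoordGenerator());

  // TensorReverse: reverse along the first dimension.
  Eigen::array<bool, 2> reverse_dims{{true, false}};
  Eigen::Tensor<float, 2> reversed = input.reverse(reverse_dims);

  // TensorShuffling: swap dimensions; with this patch the shuffling evaluator
  // copies whole blocks via TensorBlockIO when its argument exposes raw data.
  Eigen::array<int, 2> shuffle_order{{1, 0}};
  Eigen::Tensor<float, 2> shuffled = input.shuffle(shuffle_order);

  return 0;
}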