From d380c23b2cc0b02e10819e779c73cde2c62603b2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 14 Oct 2019 14:31:59 -0700 Subject: Block evaluation for TensorGenerator/TensorReverse/TensorShuffling --- unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h | 112 ++++++---- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 15 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 13 -- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 33 ++- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 5 +- unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 7 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 147 +++++++++++- unsupported/test/cxx11_tensor_block_eval.cpp | 246 ++++++++++++++++----- unsupported/test/cxx11_tensor_executor.cpp | 131 ++++++----- 16 files changed, 546 insertions(+), 188 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 29aa7a97e..f2b9389c8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -242,9 +242,8 @@ struct TensorEvaluator, Device> (internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar))); } - RightTensorBlock block = m_rightImpl.blockV2(desc, scratch); - // If block was evaluated into a destination, there is no need to do - // assignment. + RightTensorBlock block = m_rightImpl.blockV2(desc, scratch, /*root_of_expr_ast=*/true); + // If block was evaluated into a destination, there is no need to do assignment. if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { m_leftImpl.writeBlockV2(desc, block); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h index b8c592543..099d7cd57 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h @@ -45,6 +45,12 @@ EIGEN_ALWAYS_INLINE DSizes strides( return strides; } +template +EIGEN_ALWAYS_INLINE DSizes strides( + const Eigen::array& dimensions) { + return strides(DSizes(dimensions)); +} + #if EIGEN_HAS_CXX11 template EIGEN_STRONG_INLINE DSizes strides( @@ -78,23 +84,24 @@ class TensorBlockDescriptor { return static_cast(m_data); } - private: - friend class TensorBlockDescriptor; - - DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {} + template + Dimensions dimensions() const { + Dimensions dimensions; + for (int i = 0; i < NumDims; ++i) { + eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0); + dimensions[i] = m_dimensions[i] / sizeof(Scalar); + } + return dimensions; + } template - DestinationBuffer(Scalar* data, const Dimensions& dimensions, - const Dimensions& strides, size_t total_dst_bytes) - : m_data(static_cast(data)), - m_dimensions(dimensions), - m_strides(strides), - m_total_dst_bytes(total_dst_bytes) { - // TODO(ezhulenev): Benchmark template meta-unroll for this loop. 
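+  // Like dimensions() above, strides() converts the byte-based strides stored
+  // internally (the private constructor below scales both dimensions and
+  // strides by sizeof(Scalar)) back into element counts for the requested
+  // Scalar type.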
+ Dimensions strides() const { + Dimensions strides; for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] *= sizeof(Scalar); - m_strides[i] *= sizeof(Scalar); + eigen_assert(m_strides[i] % sizeof(Scalar) == 0); + strides[i] = m_strides[i] / sizeof(Scalar); } + return strides; } // Returns true if the tensor block corresponding to `desc` fits into the @@ -109,29 +116,34 @@ class TensorBlockDescriptor { if (!dimensions_match(desc_dims, dst_dims)) return false; const Dimensions& desc_strides = internal::strides(desc_dims); - const Dimensions& dst_strides = internal::strides(dst_dims); + const Dimensions& dst_strides = strides(); - return dimensions_match(desc_strides, dst_strides); - } - - template - Dimensions dimensions() const { - Dimensions dimensions; + // Compare strides ignoring dimensions of size `1`. for (int i = 0; i < NumDims; ++i) { - eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0); - dimensions[i] = m_dimensions[i] / sizeof(Scalar); + if (desc_dims[i] == 1) continue; + if (desc_strides[i] != dst_strides[i]) return false; } - return dimensions; + + return true; } + private: + friend class TensorBlockDescriptor; + + DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {} + template - Dimensions strides() const { - Dimensions strides; + DestinationBuffer(Scalar* data, const Dimensions& dimensions, + const Dimensions& strides, size_t total_dst_bytes) + : m_data(static_cast(data)), + m_dimensions(dimensions), + m_strides(strides), + m_total_dst_bytes(total_dst_bytes) { + // TODO(ezhulenev): Benchmark template meta-unroll for this loop. for (int i = 0; i < NumDims; ++i) { - eigen_assert(m_strides[i] % sizeof(Scalar) == 0); - strides[i] = m_strides[i] / sizeof(Scalar); + m_dimensions[i] *= sizeof(Scalar); + m_strides[i] *= sizeof(Scalar); } - return strides; } void* m_data; @@ -181,6 +193,12 @@ class TensorBlockDescriptor { return *this; } + bool HasDestinationBuffer() const { return m_destination.m_data != NULL; } + + const DestinationBuffer& GetDestinationBuffer() const { + return m_destination; + } + // Returns a non-nullptr pointer to a destination buffer memory if this // block has a contiguous destination buffer. template @@ -191,6 +209,11 @@ class TensorBlockDescriptor { return NULL; } + // Returns a copy of `*this` with updated offset. + TensorBlockDescriptor WithOffset(IndexType offset) const { + return TensorBlockDescriptor(offset, m_dimensions, m_destination); + } + private: // Offset and dimensions are immutable after construction. Block descriptor // can only be mutated by adding or dropping destination. @@ -294,18 +317,12 @@ enum TensorBlockKind { // Tensor block that was materialized directly into the final output memory // buffer. For example if the left side of an assignment is a Tensor, we can - // directly materialize the block in the destination memory. The block - // expression is still a valid Tensor expression, and can be used to build - // lazy expressions. + // directly materialize the block in the destination memory. + // + // If strides in the output buffer do not match tensor block strides, the + // Tensor expression will be invalid, and should not be used by + // TensorBlockAssign or for constructing another block expression. kMaterializedInOutput - - // TODO(ezhulenev): If we know that we are evaluating a block, for the root of - // the expression tree, it might be beneficial to do an assignment to the - // output memory buffer, even if it will be impossible to construct a valid - // block expression after that (e.g. 
output memory buffer has strides not - // compatible with TensorMap). This might be a performance optimization for - // uniformly shaped blocks, because for blocks skewed towards inner dimension - // `kMaterializedInOutput` should always work. }; #if !EIGEN_HAS_CXX11 } // namespace TensorBlockKind @@ -346,6 +363,11 @@ struct XprScalar { // Tensor), or a memory buffer allocated with scratch allocator, and in this // case the scratch allocator will deallocate it at the end of block based // expression execution. +// +// If the block was evaluated directly into the output buffer, and strides in +// the output buffer do not match block strides, the TensorMap expression will +// be invalid, and should never be used in block assignment or any other tensor +// expression. template @@ -358,11 +380,12 @@ class TensorMaterializedBlock { typedef TensorMap > XprType; TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, - const Dimensions& dimensions) + const Dimensions& dimensions, bool valid_expr = true) : m_kind(kind), m_data(data), m_dimensions(dimensions), - m_expr(m_data, m_dimensions) { + m_expr(m_data, m_dimensions), + m_valid_expr(valid_expr) { eigen_assert(m_kind == internal::TensorBlockKind::kView || m_kind == internal::TensorBlockKind::kMaterializedInScratch || m_kind == internal::TensorBlockKind::kMaterializedInOutput); @@ -372,7 +395,10 @@ class TensorMaterializedBlock { // NOTE(ezhulenev): Returning XprType by value like in other block types // causes asan failures. The theory is that XprType::Nested doesn't work // properly for TensorMap. - const XprType& expr() const { return m_expr; } + const XprType& expr() const { + eigen_assert(m_valid_expr); + return m_expr; + } const Scalar* data() const { return m_data; } void cleanup() {} @@ -427,6 +453,7 @@ class TensorMaterializedBlock { bool materialized_in_output; if (block_buffer != NULL) { + desc.DropDestinationBuffer(); materialized_in_output = true; } else { @@ -461,6 +488,7 @@ class TensorMaterializedBlock { const Scalar* m_data; Dimensions m_dimensions; XprType m_expr; + bool m_valid_expr; }; // -------------------------------------------------------------------------- // diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index dc9551d32..cc0a00e8d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -882,7 +882,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 20591da33..7eaf1f09e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -368,7 +368,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { const Index chip_dim = m_dim.actualDim(); DSizes input_block_dims; @@ -390,6 +391,7 @@ struct TensorEvaluator, Device> } ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, 
scratch); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); if (arg_block.data() != NULL) { // Forward argument block buffer if possible. @@ -405,6 +407,7 @@ struct TensorEvaluator, Device> bool materialized_in_output; if (output_buffer != NULL) { + desc.DropDestinationBuffer(); materialized_in_output = true; } else { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index cc3e67677..2a6d67ad5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -404,7 +404,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { return TensorBlockV2(m_impl.blockV2(desc, scratch), TensorConversionOpBlockFactory()); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index d7bebd30b..132458a20 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -481,7 +481,7 @@ struct sizes_match_below_dim { template -EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) { return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index b77d8fe84..4c2767d44 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -166,7 +166,8 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { assert(m_data != NULL); return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); } @@ -353,7 +354,8 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { assert(m_data != NULL); return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); } @@ -571,7 +573,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { return TensorBlockV2(m_argImpl.blockV2(desc, scratch), m_functor); } @@ -729,7 +732,8 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { // It's unsafe to pass destination buffer to underlying expressions, because // output might be aliased with one of the inputs. 
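    // For example, when the same tensor appears on both sides of an assignment,
    // a block materialized directly into the destination could overwrite
    // coefficients of an input before they are read.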
desc.DropDestinationBuffer(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 6ad6327a6..97ac96db1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -521,19 +521,6 @@ class TensorExecutor::value) { - internal::TensorExecutor::run(expr, device); - evaluator.cleanup(); - return; - } const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index d98af1355..7d12e781e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -176,7 +176,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { assert(m_buffer != NULL); return TensorBlockV2::materialize(m_buffer, m_impl.dimensions(), desc, scratch); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 38d0bf7d3..c69e2df92 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -238,7 +238,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); @@ -253,6 +254,7 @@ struct TensorEvaluator, Device> bool materialized_in_output; if (block_buffer != NULL) { + desc.DropDestinationBuffer(); materialized_in_output = true; } else { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index c9d78ba9b..ab3a979a8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -365,7 +365,8 @@ struct TensorEvaluator, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { eigen_assert(m_impl.data() != NULL); eigen_assert((kind == Runtime) || (kind == OneByN && desc.dimensions()[0] == 1) || @@ -611,7 +612,7 @@ struct TensorEvaluator, Devi IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -624,7 +625,12 @@ struct TensorEvaluator, Devi typedef typename TensorBlock::Dimensions TensorBlockDimensions; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + // Tensor slicing does not change the block type. 
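+  // The slicing evaluator simply forwards the block request to the argument
+  // evaluator with the offset shifted to the start of the slice (see blockV2
+  // below), so the argument's block type can be reused as-is.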
+ typedef typename TensorEvaluator::TensorBlockV2 + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -804,6 +810,15 @@ struct TensorEvaluator, Devi m_impl.block(&input_block); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); + TensorBlockV2 block = m_impl.blockV2(arg_desc, scratch); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + return block; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { typename Storage::Type result = constCast(m_impl.data()); if (result) { @@ -900,7 +915,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, @@ -913,7 +928,8 @@ struct TensorEvaluator, Device> typedef typename TensorBlock::Dimensions TensorBlockDimensions; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -987,6 +1003,13 @@ struct TensorEvaluator, Device> block.block_strides(), TensorBlockDimensions(this->m_inputStrides), const_cast(block.data()))); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + const TensorBlockDesc& desc, const TensorBlockV2& block) { + TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); + this->m_impl.writeBlockV2(arg_desc, block); + } }; namespace internal { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index a0b4e04b1..99c74fc67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -230,7 +230,8 @@ struct TensorEvaluator, Device } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { // If one of the dimensions is zero, return empty block view. if (desc.size() == 0) { return TensorBlockV2(internal::TensorBlockKind::kView, NULL, @@ -240,8 +241,8 @@ struct TensorEvaluator, Device // Check if we can reuse `desc` destination, or allocate new scratch buffer. 
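    // destination() returns a non-null pointer only if the descriptor carries a
    // destination buffer into which the block fits contiguously; otherwise we
    // fall back to a scratch allocation below.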
ScalarNoConst* materialized_output = desc.template destination(); - bool materialized_in_output; + if (materialized_output != NULL) { desc.DropDestinationBuffer(); materialized_in_output = true; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 6e7abeb09..a51c88540 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -355,7 +355,8 @@ struct TensorEvaluator, Device } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 - blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { // TODO(ezhulenev): If underlying tensor expression supports and prefers // block evaluation we must use it. Currently we use coeff and packet // access into the underlying tensor expression. @@ -370,10 +371,12 @@ struct TensorEvaluator, Device const bool inner_dim_reversed = m_reverse[inner_dim_idx]; // Try to reuse destination as an output block buffer. - CoeffReturnType* block_buffer = desc.template destination(); + CoeffReturnType* block_buffer = + desc.template destination(); bool materialized_in_output; if (block_buffer != NULL) { + desc.DropDestinationBuffer(); materialized_in_output = true; } else { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 5e8abad75..bb9908b62 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -116,7 +116,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -131,7 +131,12 @@ struct TensorEvaluator, Device> TensorBlockReader; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -143,6 +148,7 @@ struct TensorEvaluator, Device> const Shuffle& shuffle = op.shufflePermutation(); m_is_identity = true; for (int i = 0; i < NumDims; ++i) { + m_shuffle[i] = static_cast(shuffle[i]); m_dimensions[i] = input_dims[shuffle[i]]; m_inverseShuffle[shuffle[i]] = i; if (m_is_identity && shuffle[i] != i) { @@ -241,7 +247,6 @@ struct TensorEvaluator, Device> 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::kUniformAllDims, block_total_size_max)); - m_impl.getResourceRequirements(resources); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( @@ -336,6 +341,78 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + assert(m_impl.data() != NULL); + + typedef internal::TensorBlockIOV2 + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src 
TensorBlockIOSrc; + + ScalarNoConst* block_buffer = NULL; + typename TensorBlockIO::Dimensions block_strides; + + bool materialized_in_output = false; + bool has_valid_materialized_expr = true; + + if (desc.HasDestinationBuffer()) { + // Check if we can reuse destination buffer for block materialization. + const typename TensorBlockDesc::DestinationBuffer& destination_buffer = + desc.GetDestinationBuffer(); + + const bool dims_match = dimensions_match( + desc.dimensions(), destination_buffer.template dimensions()); + + const bool strides_match = + dimensions_match(internal::strides(desc.dimensions()), + destination_buffer.template strides()); + + if (dims_match && strides_match) { + // Destination buffer fits the block contiguously. + materialized_in_output = true; + has_valid_materialized_expr = true; + block_buffer = destination_buffer.template data(); + block_strides = internal::strides(desc.dimensions()); + eigen_assert(block_buffer != NULL); + + } else if (dims_match && root_of_expr_ast) { + // Destination buffer has strides not matching the block strides, but + // for the root of the expression tree it's safe to materialize anyway. + materialized_in_output = true; + has_valid_materialized_expr = false; + block_buffer = destination_buffer.template data(); + block_strides = destination_buffer.template strides(); + eigen_assert(block_buffer != NULL); + } + + if (materialized_in_output) desc.DropDestinationBuffer(); + } + + // If we were not able to reuse destination buffer, allocate temporary + // buffer for block evaluation using scratch allocator. + if (!materialized_in_output) { + void* mem = scratch.allocate(desc.size() * sizeof(ScalarNoConst)); + block_buffer = static_cast(mem); + block_strides = internal::strides(desc.dimensions()); + } + + typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides); + TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset())); + + TensorBlockIODst dst(desc.dimensions(), block_strides, block_buffer); + + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle); + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + return TensorBlockV2( + materialized_in_output + ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + block_buffer, desc.dimensions(), has_valid_materialized_expr); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { const double compute_cost = m_is_identity ? TensorOpCost::AddCost() : NumDims * (2 * TensorOpCost::AddCost() + @@ -400,7 +477,8 @@ struct TensorEvaluator, Device> Dimensions m_dimensions; bool m_is_identity; - array m_inverseShuffle; + array m_shuffle; + array m_inverseShuffle; // TODO(ezhulenev): Make it int type. 
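+  // m_shuffle maps an output dimension to the input dimension it reads from
+  // (m_dimensions[i] == input_dims[m_shuffle[i]]), while m_inverseShuffle maps
+  // an input dimension back to its position in the output; e.g. for
+  // shuffle = {2, 0, 1} the inverse is {1, 2, 0}.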
array m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; @@ -431,7 +509,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, RawAccess = false @@ -445,7 +523,7 @@ struct TensorEvaluator, Device> TensorBlockWriter; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -477,6 +555,63 @@ struct TensorEvaluator, Device> this->m_inverseShuffle, this->m_unshuffledInputStrides, this->m_impl.data()); } + +template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + const TensorBlockDesc& desc, const TensorBlockV2& block) { + eigen_assert(this->m_impl.data() != NULL); + + typedef internal::TensorBlockIOV2 + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + const Scalar* block_buffer = block.data(); + + // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen + // expression with coefficient and packet access as `src`. + void* mem = NULL; + if (block_buffer == NULL) { + mem = this->m_device.allocate(desc.size() * sizeof(Scalar)); + ScalarNoConst* buf = static_cast(mem); + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumDims, typename TensorBlockV2::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + desc.dimensions(), internal::strides(desc.dimensions()), + buf), + block.expr()); + + block_buffer = buf; + } + + // Read from block. + TensorBlockIOSrc src(internal::strides(desc.dimensions()), + block_buffer); + + // Write to the output buffer. + typename TensorBlockIO::Dimensions output_strides( + this->m_unshuffledInputStrides); + typename TensorBlockIO::Dimensions output_dimensions; + for (int i = 0; i < NumDims; ++i) { + output_dimensions[this->m_shuffle[i]] = desc.dimension(i); + } + TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(), + this->srcCoeff(desc.offset())); + + // Reorder dimensions according to the shuffle. + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map; + for (int i = 0; i < NumDims; ++i) { + dst_to_src_dim_map[i] = static_cast(this->m_inverseShuffle[i]); + } + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + // Deallocate temporary buffer used for the block materialization. + if (mem != NULL) this->m_device.deallocate(mem); + } }; diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp index aac75014c..eb93c5c53 100644 --- a/unsupported/test/cxx11_tensor_block_eval.cpp +++ b/unsupported/test/cxx11_tensor_block_eval.cpp @@ -139,23 +139,50 @@ static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) { // Evaluate TensorBlock expression into a tensor. Tensor block(block_params.desc.dimensions()); + // Dimensions for the potential destination buffer. 
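+  // Half of the time the destination matches the block dimensions exactly;
+  // otherwise each dimension is padded by a random extent (possibly zero), so
+  // the destination strides generally do not match the block strides and the
+  // non-contiguous destination path gets exercised.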
+ DSizes dst_dims; + if (internal::random()) { + dst_dims = block_params.desc.dimensions(); + } else { + for (int i = 0; i < NumDims; ++i) { + Index extent = internal::random(0, 5); + dst_dims[i] = block_params.desc.dimension(i) + extent; + } + } + // Maybe use this tensor as a block desc destination. - Tensor dst(block_params.desc.dimensions()); + Tensor dst(dst_dims); + dst.setZero(); if (internal::random()) { block_params.desc.template AddDestinationBuffer( dst.data(), internal::strides(dst.dimensions()), dst.dimensions().TotalSize() * sizeof(T)); } - auto tensor_block = eval.blockV2(block_params.desc, scratch); - auto b_expr = tensor_block.expr(); + const bool root_of_expr = internal::random(); + auto tensor_block = eval.blockV2(block_params.desc, scratch, root_of_expr); + + if (tensor_block.kind() == internal::TensorBlockKind::kMaterializedInOutput) { + // Copy data from destination buffer. + if (dimensions_match(dst.dimensions(), block.dimensions())) { + block = dst; + } else { + DSizes offsets; + for (int i = 0; i < NumDims; ++i) offsets[i] = 0; + block = dst.slice(offsets, block.dimensions()); + } - // We explicitly disable vectorization and tiling, to run a simple coefficient - // wise assignment loop, because it's very simple and should be correct. - using BlockAssign = TensorAssignOp; - using BlockExecutor = TensorExecutor; - BlockExecutor::run(BlockAssign(block, b_expr), d); + } else { + // Assign to block from expression. + auto b_expr = tensor_block.expr(); + + // We explicitly disable vectorization and tiling, to run a simple coefficient + // wise assignment loop, because it's very simple and should be correct. + using BlockAssign = TensorAssignOp; + using BlockExecutor = TensorExecutor; + BlockExecutor::run(BlockAssign(block, b_expr), d); + } // Cleanup temporary buffers owned by a tensor block. tensor_block.cleanup(); @@ -375,17 +402,16 @@ static void test_eval_tensor_generator() { Tensor input(dims); input.setRandom(); - auto generator = [](const array& dims) -> T { + auto generator = [](const array& coords) -> T { T result = static_cast(0); for (int i = 0; i < NumDims; ++i) { - result += static_cast((i + 1) * dims[i]); + result += static_cast((i + 1) * coords[i]); } return result; }; VerifyBlockEvaluator( - input.generate(generator), - [&dims]() { return FixedSizeBlock(dims); }); + input.generate(generator), [&dims]() { return FixedSizeBlock(dims); }); VerifyBlockEvaluator( input.generate(generator), @@ -403,12 +429,63 @@ static void test_eval_tensor_reverse() { for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random(); VerifyBlockEvaluator( - input.reverse(reverse), - [&dims]() { return FixedSizeBlock(dims); }); + input.reverse(reverse), [&dims]() { return FixedSizeBlock(dims); }); + + VerifyBlockEvaluator(input.reverse(reverse), [&dims]() { + return RandomBlock(dims, 1, 10); + }); +} + +template +static void test_eval_tensor_slice() { + DSizes dims = RandomDims(10, 20); + Tensor input(dims); + input.setRandom(); + + // Pick a random slice of an input tensor. + DSizes slice_start = RandomDims(5, 10); + DSizes slice_size = RandomDims(5, 10); + + // Make sure that slice start + size do not overflow tensor dims. 
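+  // (slice_start is clamped to at most dims - 1 and slice_size to the extent
+  // remaining after the start, so the slice always stays inside the tensor.)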
+ for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + VerifyBlockEvaluator( + input.slice(slice_start, slice_size), + [&slice_size]() { return FixedSizeBlock(slice_size); }); VerifyBlockEvaluator( - input.reverse(reverse), - [&dims]() { return RandomBlock(dims, 1, 10); }); + input.slice(slice_start, slice_size), + [&slice_size]() { return RandomBlock(slice_size, 1, 10); }); +} + +template +static void test_eval_tensor_shuffle() { + DSizes dims = RandomDims(5, 15); + Tensor input(dims); + input.setRandom(); + + DSizes shuffle; + for (int i = 0; i < NumDims; ++i) shuffle[i] = i; + + do { + DSizes shuffled_dims; + for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]]; + + VerifyBlockEvaluator( + input.shuffle(shuffle), + [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); }); + + VerifyBlockEvaluator( + input.shuffle(shuffle), [&shuffled_dims]() { + return RandomBlock(shuffled_dims, 1, 5); + }); + + break; + + } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); } template @@ -564,7 +641,7 @@ static void test_assign_to_tensor_chipping() { Index chip_dim = internal::random(0, NumDims - 1); Index chip_offset = internal::random(0, dims[chip_dim] - 2); - DSizes < Index, NumDims - 1 > chipped_dims; + DSizes chipped_dims; for (Index i = 0; i < chip_dim; ++i) { chipped_dims[i] = dims[i]; } @@ -587,42 +664,111 @@ static void test_assign_to_tensor_chipping() { [&chipped_dims]() { return FixedSizeBlock(chipped_dims); }); } -// -------------------------------------------------------------------------- // +template +static void test_assign_to_tensor_slice() { + DSizes dims = RandomDims(10, 20); + Tensor tensor(dims); + + // Pick a random slice of tensor. + DSizes slice_start = RandomDims(5, 10); + DSizes slice_size = RandomDims(5, 10); + + // Make sure that slice start + size do not overflow tensor dims. 
+ for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + TensorMap> map(tensor.data(), dims); + + VerifyBlockAssignment( + tensor, map.slice(slice_start, slice_size), + [&slice_size]() { return RandomBlock(slice_size, 1, 10); }); + + VerifyBlockAssignment( + tensor, map.slice(slice_start, slice_size), + [&slice_size]() { return SkewedInnerBlock(slice_size); }); -#define CALL_SUBTESTS_DIMS_LAYOUTS(NAME) \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())) + VerifyBlockAssignment( + tensor, map.slice(slice_start, slice_size), + [&slice_size]() { return FixedSizeBlock(slice_size); }); +} + +template +static void test_assign_to_tensor_shuffle() { + DSizes dims = RandomDims(5, 15); + Tensor tensor(dims); + + DSizes shuffle; + for (int i = 0; i < NumDims; ++i) shuffle[i] = i; + + TensorMap> map(tensor.data(), dims); -#define CALL_SUBTESTS_LAYOUTS(NAME) \ - CALL_SUBTEST((NAME())); \ - CALL_SUBTEST((NAME())) + do { + DSizes shuffled_dims; + for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]]; + + VerifyBlockAssignment( + tensor, map.shuffle(shuffle), + [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); }); + + VerifyBlockAssignment( + tensor, map.shuffle(shuffle), [&shuffled_dims]() { + return RandomBlock(shuffled_dims, 1, 5); + }); + + } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); +} + +// -------------------------------------------------------------------------- // + +#define CALL_SUBTEST_PART(PART) \ + CALL_SUBTEST_##PART + +#define CALL_SUBTESTS_DIMS_LAYOUTS(PART, NAME) \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())) + +#define CALL_SUBTESTS_LAYOUTS(PART, NAME) \ + CALL_SUBTEST_PART(PART)((NAME())); \ + CALL_SUBTEST_PART(PART)((NAME())) EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) { // clang-format off - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_block); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_unary_expr_block); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_binary_expr_block); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_binary_with_unary_expr_block); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_broadcast); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_reshape); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_cast); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_select); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_padding); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_chipping); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_generator); - CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_reverse); - - CALL_SUBTESTS_LAYOUTS(test_eval_tensor_reshape_with_bcast); - CALL_SUBTESTS_LAYOUTS(test_eval_tensor_forced_eval); - - CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor); - CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_reshape); - CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_chipping); + CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_block); + CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_unary_expr_block); + 
CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_binary_expr_block); + CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_binary_with_unary_expr_block); + CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_broadcast); + CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_reshape); + CALL_SUBTESTS_DIMS_LAYOUTS(3, test_eval_tensor_cast); + CALL_SUBTESTS_DIMS_LAYOUTS(3, test_eval_tensor_select); + CALL_SUBTESTS_DIMS_LAYOUTS(3, test_eval_tensor_padding); + CALL_SUBTESTS_DIMS_LAYOUTS(4, test_eval_tensor_chipping); + CALL_SUBTESTS_DIMS_LAYOUTS(4, test_eval_tensor_generator); + CALL_SUBTESTS_DIMS_LAYOUTS(4, test_eval_tensor_reverse); + CALL_SUBTESTS_DIMS_LAYOUTS(5, test_eval_tensor_slice); + CALL_SUBTESTS_DIMS_LAYOUTS(5, test_eval_tensor_shuffle); + + CALL_SUBTESTS_LAYOUTS(6, test_eval_tensor_reshape_with_bcast); + CALL_SUBTESTS_LAYOUTS(6, test_eval_tensor_forced_eval); + + CALL_SUBTESTS_DIMS_LAYOUTS(7, test_assign_to_tensor); + CALL_SUBTESTS_DIMS_LAYOUTS(7, test_assign_to_tensor_reshape); + CALL_SUBTESTS_DIMS_LAYOUTS(7, test_assign_to_tensor_chipping); + CALL_SUBTESTS_DIMS_LAYOUTS(8, test_assign_to_tensor_slice); + CALL_SUBTESTS_DIMS_LAYOUTS(8, test_assign_to_tensor_shuffle); + + // Force CMake to split this test. + // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8 + // clang-format on } diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp index 66f932746..dd68ddf17 100644 --- a/unsupported/test/cxx11_tensor_executor.cpp +++ b/unsupported/test/cxx11_tensor_executor.cpp @@ -21,6 +21,30 @@ using Eigen::internal::TiledEvaluation; // A set of tests to verify that different TensorExecutor strategies yields the // same results for all the ops, supporting tiled evaluation. +// Default assignment that does no use block evaluation or vectorization. +// We assume that default coefficient evaluation is well tested and correct. +template +static void DefaultAssign(Dst& dst, Expr expr) { + using Assign = Eigen::TensorAssignOp; + using Executor = + Eigen::internal::TensorExecutor; + + Executor::run(Assign(dst, expr), DefaultDevice()); +} + +// Assignment with specified device and tiling strategy. +template +static void DeviceAssign(Device& d, Dst& dst, Expr expr) { + using Assign = Eigen::TensorAssignOp; + using Executor = Eigen::internal::TensorExecutor; + + Executor::run(Assign(dst, expr), d); +} + template static array RandomDims(int min_dim = 1, int max_dim = 20) { array dims; @@ -222,30 +246,32 @@ static void test_execute_shuffle_rvalue(Device d) Tensor src(dims); src.setRandom(); - // Create a random dimension re-ordering/shuffle. - std::vector shuffle; - for (int i = 0; i < NumDims; ++i) shuffle.push_back(i); - std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); + DSizes shuffle; + for (int i = 0; i < NumDims; ++i) shuffle[i] = i; - const auto expr = src.shuffle(shuffle); + // Test all possible shuffle permutations. + do { + DSizes shuffled_dims; + for (int i = 0; i < NumDims; ++i) { + shuffled_dims[i] = dims[shuffle[i]]; + } - // We assume that shuffling on a default device is tested and correct, so - // we can rely on it to verify correctness of tensor executor and tiling. - Tensor golden; - golden = expr; + const auto expr = src.shuffle(shuffle); - // Now do the shuffling using configured tensor executor. - Tensor dst(golden.dimensions()); + // We assume that shuffling on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. 
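+    // golden is computed with the plain coefficient-wise executor
+    // (DefaultAssign above); dst is computed with the executor configuration
+    // under test (DeviceAssign below) and compared element-by-element.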
+ Tensor golden(shuffled_dims); + DefaultAssign(golden, expr); - using Assign = TensorAssignOp; - using Executor = - internal::TensorExecutor; + // Now do the shuffling using configured tensor executor. + Tensor dst(shuffled_dims); + DeviceAssign(d, dst, expr); - Executor::run(Assign(dst, expr), d); + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } - for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { - VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); - } + } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); } template src(dims); src.setRandom(); - // Create a random dimension re-ordering/shuffle. - std::vector shuffle; - for (int i = 0; i < NumDims; ++i) shuffle.push_back(i); - std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); + DSizes shuffle; + for (int i = 0; i < NumDims; ++i) shuffle[i] = i; - array shuffled_dims; - for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i]; + // Test all possible shuffle permutations. + do { + DSizes shuffled_dims; + for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i]; - // We assume that shuffling on a default device is tested and correct, so - // we can rely on it to verify correctness of tensor executor and tiling. - Tensor golden(shuffled_dims); - golden.shuffle(shuffle) = src; + // We assume that shuffling on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor golden(shuffled_dims); + auto golden_shuffle = golden.shuffle(shuffle); + DefaultAssign(golden_shuffle, src); - // Now do the shuffling using configured tensor executor. - Tensor dst(shuffled_dims); + // Now do the shuffling using configured tensor executor. + Tensor dst(shuffled_dims); + auto dst_shuffle = dst.shuffle(shuffle); + DeviceAssign(d, dst_shuffle, src); - auto expr = dst.shuffle(shuffle); - - using Assign = TensorAssignOp; - using Executor = - internal::TensorExecutor; - - Executor::run(Assign(expr, src), d); + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } - for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { - VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); - } + } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); } template