From d380c23b2cc0b02e10819e779c73cde2c62603b2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 14 Oct 2019 14:31:59 -0700 Subject: Block evaluation for TensorGenerator/TensorReverse/TensorShuffling --- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 147 ++++++++++++++++++++- 1 file changed, 141 insertions(+), 6 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 5e8abad75..bb9908b62 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -116,7 +116,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -131,7 +131,12 @@ struct TensorEvaluator, Device> TensorBlockReader; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, @@ -143,6 +148,7 @@ struct TensorEvaluator, Device> const Shuffle& shuffle = op.shufflePermutation(); m_is_identity = true; for (int i = 0; i < NumDims; ++i) { + m_shuffle[i] = static_cast(shuffle[i]); m_dimensions[i] = input_dims[shuffle[i]]; m_inverseShuffle[shuffle[i]] = i; if (m_is_identity && shuffle[i] != i) { @@ -241,7 +247,6 @@ struct TensorEvaluator, Device> 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); resources->push_back(internal::TensorOpResourceRequirements( internal::kUniformAllDims, block_total_size_max)); - m_impl.getResourceRequirements(resources); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( @@ -336,6 +341,78 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + assert(m_impl.data() != NULL); + + typedef internal::TensorBlockIOV2 + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + ScalarNoConst* block_buffer = NULL; + typename TensorBlockIO::Dimensions block_strides; + + bool materialized_in_output = false; + bool has_valid_materialized_expr = true; + + if (desc.HasDestinationBuffer()) { + // Check if we can reuse destination buffer for block materialization. + const typename TensorBlockDesc::DestinationBuffer& destination_buffer = + desc.GetDestinationBuffer(); + + const bool dims_match = dimensions_match( + desc.dimensions(), destination_buffer.template dimensions()); + + const bool strides_match = + dimensions_match(internal::strides(desc.dimensions()), + destination_buffer.template strides()); + + if (dims_match && strides_match) { + // Destination buffer fits the block contiguously. + materialized_in_output = true; + has_valid_materialized_expr = true; + block_buffer = destination_buffer.template data(); + block_strides = internal::strides(desc.dimensions()); + eigen_assert(block_buffer != NULL); + + } else if (dims_match && root_of_expr_ast) { + // Destination buffer has strides not matching the block strides, but + // for the root of the expression tree it's safe to materialize anyway. + materialized_in_output = true; + has_valid_materialized_expr = false; + block_buffer = destination_buffer.template data(); + block_strides = destination_buffer.template strides(); + eigen_assert(block_buffer != NULL); + } + + if (materialized_in_output) desc.DropDestinationBuffer(); + } + + // If we were not able to reuse destination buffer, allocate temporary + // buffer for block evaluation using scratch allocator. + if (!materialized_in_output) { + void* mem = scratch.allocate(desc.size() * sizeof(ScalarNoConst)); + block_buffer = static_cast(mem); + block_strides = internal::strides(desc.dimensions()); + } + + typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides); + TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset())); + + TensorBlockIODst dst(desc.dimensions(), block_strides, block_buffer); + + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle); + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + return TensorBlockV2( + materialized_in_output + ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + block_buffer, desc.dimensions(), has_valid_materialized_expr); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { const double compute_cost = m_is_identity ? TensorOpCost::AddCost() : NumDims * (2 * TensorOpCost::AddCost() + @@ -400,7 +477,8 @@ struct TensorEvaluator, Device> Dimensions m_dimensions; bool m_is_identity; - array m_inverseShuffle; + array m_shuffle; + array m_inverseShuffle; // TODO(ezhulenev): Make it int type. array m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; @@ -431,7 +509,7 @@ struct TensorEvaluator, Device> IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = TensorEvaluator::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator::RawAccess, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, RawAccess = false @@ -445,7 +523,7 @@ struct TensorEvaluator, Device> TensorBlockWriter; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor TensorBlockDesc; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -477,6 +555,63 @@ struct TensorEvaluator, Device> this->m_inverseShuffle, this->m_unshuffledInputStrides, this->m_impl.data()); } + +template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + const TensorBlockDesc& desc, const TensorBlockV2& block) { + eigen_assert(this->m_impl.data() != NULL); + + typedef internal::TensorBlockIOV2 + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + const Scalar* block_buffer = block.data(); + + // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen + // expression with coefficient and packet access as `src`. + void* mem = NULL; + if (block_buffer == NULL) { + mem = this->m_device.allocate(desc.size() * sizeof(Scalar)); + ScalarNoConst* buf = static_cast(mem); + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumDims, typename TensorBlockV2::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + desc.dimensions(), internal::strides(desc.dimensions()), + buf), + block.expr()); + + block_buffer = buf; + } + + // Read from block. + TensorBlockIOSrc src(internal::strides(desc.dimensions()), + block_buffer); + + // Write to the output buffer. + typename TensorBlockIO::Dimensions output_strides( + this->m_unshuffledInputStrides); + typename TensorBlockIO::Dimensions output_dimensions; + for (int i = 0; i < NumDims; ++i) { + output_dimensions[this->m_shuffle[i]] = desc.dimension(i); + } + TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(), + this->srcCoeff(desc.offset())); + + // Reorder dimensions according to the shuffle. + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map; + for (int i = 0; i < NumDims; ++i) { + dst_to_src_dim_map[i] = static_cast(this->m_inverseShuffle[i]); + } + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + // Deallocate temporary buffer used for the block materialization. + if (mem != NULL) this->m_device.deallocate(mem); + } }; -- cgit v1.2.3