diff options
author | Eugene Zhulenev <ezhulenev@google.com> | 2019-10-09 12:45:31 -0700 |
---|---|---|
committer | Eugene Zhulenev <ezhulenev@google.com> | 2019-10-09 12:45:31 -0700 |
commit | 33e174613987cfc6c83576dc0fe8086c7a5d1b1f (patch) | |
tree | 4f4c62eab5c0feca0f233624c9c1fc571c491781 /unsupported/Eigen/CXX11/src/Tensor | |
parent | f0a4642baba70a64128964d96c4ede012614925e (diff) |
Block evaluation for TensorChipping + fixed bugs in TensorPadding and TensorSlicing
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
7 files changed, 193 insertions, 33 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h index 3880e7ed3..b8c592543 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h @@ -418,12 +418,22 @@ class TensorMaterializedBlock { if (can_use_direct_access) { const Scalar* block_start = data + desc.offset(); - return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start, - desc.dimensions()); + return TensorMaterializedBlock(internal::TensorBlockKind::kView, + block_start, desc.dimensions()); } else { - void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); - Scalar* block_buffer = static_cast<Scalar*>(mem); + // Try to reuse destination as an output block buffer. + Scalar* block_buffer = desc.template destination<Scalar, Layout>(); + bool materialized_in_output; + + if (block_buffer != NULL) { + materialized_in_output = true; + + } else { + materialized_in_output = false; + void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); + block_buffer = static_cast<Scalar*>(mem); + } typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout> TensorBlockIO; @@ -438,8 +448,11 @@ class TensorMaterializedBlock { TensorBlockIO::Copy(dst, src); - return TensorMaterializedBlock(internal::TensorBlockKind::kMaterializedInScratch, - block_buffer, desc.dimensions()); + return TensorMaterializedBlock( + materialized_in_output + ? 
internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + block_buffer, desc.dimensions()); } } @@ -1141,7 +1154,7 @@ class TensorBlockAssignment { it[idx].count = 0; it[idx].size = target.dims[dim]; it[idx].output_stride = target.strides[dim]; - it[idx].output_span = it[i].output_stride * (it[i].size - 1); + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); idx++; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 8860840a7..20591da33 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -149,7 +149,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Layout = TensorEvaluator<ArgType, Device>::Layout, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, - BlockAccessV2 = false, + BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2, // Chipping of outer-most dimension is a trivial operation, because we can // read and write directly from the underlying tensor using single offset. 
IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) || @@ -171,7 +171,17 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> OutputTensorBlock; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef internal::TensorBlockDescriptor<NumInputDims, Index> + ArgTensorBlockDesc; + typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2 + ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, + Layout, Index> + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -357,6 +367,72 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> m_impl.block(&input_block); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + const Index chip_dim = m_dim.actualDim(); + + DSizes<Index, NumInputDims> input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) + : i > chip_dim ? desc.dimension(i - 1) + : 1; + } + + ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims); + + // Try to reuse destination buffer for materializing argument block. 
+ ScalarNoConst* destination_buffer = + desc.template destination<ScalarNoConst, Layout>(); + if (destination_buffer != NULL) { + arg_desc.AddDestinationBuffer( + destination_buffer, internal::strides<Layout>(arg_desc.dimensions()), + (arg_desc.size() * sizeof(Scalar))); + } + + ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch); + + if (arg_block.data() != NULL) { + // Forward argument block buffer if possible. + return TensorBlockV2(arg_block.kind(), arg_block.data(), + desc.dimensions()); + + } else { + // Assign argument block expression to a buffer. + + // Try to reuse destination as an output buffer. + ScalarNoConst* output_buffer = + desc.template destination<ScalarNoConst, Layout>(); + bool materialized_in_output; + + if (output_buffer != NULL) { + materialized_in_output = true; + + } else { + materialized_in_output = false; + const size_t materialized_output_size = desc.size() * sizeof(Scalar); + void* output_scratch_mem = scratch.allocate(materialized_output_size); + output_buffer = static_cast<ScalarNoConst*>(output_scratch_mem); + } + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + arg_desc.dimensions(), + internal::strides<Layout>(arg_desc.dimensions()), + output_buffer), + arg_block.expr()); + + return TensorBlockV2( + materialized_in_output + ? 
internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + output_buffer, desc.dimensions()); + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { typename Storage::Type result = constCast(m_impl.data()); if (isOuterChipping() && result) { @@ -434,11 +510,12 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> static const int PacketSize = PacketType<CoeffReturnType, Device>::size; enum { - IsAligned = false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + RawAccess = false }; typedef typename internal::remove_const<Scalar>::type ScalarNoConst; @@ -448,6 +525,10 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> OutputTensorBlock; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -539,6 +620,36 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> input_block_strides, this->m_inputStrides, const_cast<ScalarNoConst*>(output_block.data()))); } + + template <typename TensorBlockV2> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + const TensorBlockDesc& desc, const TensorBlockV2& block) { + assert(this->m_impl.data() != NULL); + + const Index 
chip_dim = this->m_dim.actualDim(); + + DSizes<Index, NumInputDims> input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) + : i > chip_dim ? desc.dimension(i - 1) + : 1; + } + + typedef TensorReshapingOp<const DSizes<Index, NumInputDims>, + const typename TensorBlockV2::XprType> + TensorBlockExpr; + + typedef internal::TensorBlockAssignment<Scalar, NumInputDims, + TensorBlockExpr, Index> + TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target( + input_block_dims, + internal::strides<Layout>(this->m_impl.dimensions()), + this->m_impl.data(), this->srcCoeff(desc.offset())), + block.expr().reshape(input_block_dims)); + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index b1d668744..b77d8fe84 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -53,18 +53,22 @@ struct TensorEvaluator RawAccess = true }; - typedef typename internal::TensorBlock< - typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout> + typedef typename internal::remove_const<Scalar>::type ScalarNoConst; + + typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout> TensorBlock; - typedef typename internal::TensorBlockReader< - typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout> + typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout> TensorBlockReader; - typedef typename internal::TensorBlockWriter< - typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout> + typedef typename internal::TensorBlockWriter<ScalarNoConst, Index, NumCoords, Layout> TensorBlockWriter; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc; + typedef 
internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords, + Layout, Index> + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) @@ -161,6 +165,12 @@ struct TensorEvaluator TensorBlockReader::Run(block, m_data); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + assert(m_data != NULL); + return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( const TensorBlock& block) { assert(m_data != NULL); @@ -269,11 +279,6 @@ struct TensorEvaluator<const Derived, Device> typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc; typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; - typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumCoords, Layout> - TensorBlockIO; - typedef typename TensorBlockIO::Dst TensorBlockIODst; - typedef typename TensorBlockIO::Src TensorBlockIOSrc; - typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords, Layout, Index> TensorBlockV2; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 97ac96db1..6ad6327a6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -521,6 +521,19 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { Evaluator evaluator(expr, device); + Index total_size = array_prod(evaluator.dimensions()); + Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + + // TODO(ezuhulenev): For small expressions cost of block mapping and + // resource 
requirements gathering dominates the cost of expression + // evaluation. + if (total_size < cache_size && + !ExpressionHasTensorBroadcastingOp<Expression>::value) { + internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, + /*Tiling=*/TiledEvaluation::Off>::run(expr, device); + evaluator.cleanup(); + return; + } const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); if (needs_assign) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 8d45bd62a..d98af1355 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -97,21 +97,26 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device> IsAligned = true, PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), BlockAccess = internal::is_arithmetic<CoeffReturnType>::value, - BlockAccessV2 = false, + BlockAccessV2 = internal::is_arithmetic<CoeffReturnType>::value, PreferBlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, RawAccess = true }; - typedef typename internal::TensorBlock< - CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout> + static const int NumDims = internal::traits<ArgType>::NumDimensions; + + typedef typename internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout> TensorBlock; - typedef typename internal::TensorBlockReader< - CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout> + typedef typename internal::TensorBlockReader<CoeffReturnType, Index, NumDims, Layout> TensorBlockReader; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock<CoeffReturnType, 
NumDims, + Layout, Index> + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -170,6 +175,12 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device> TensorBlockReader::Run(block, m_buffer); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + assert(m_buffer != NULL); + return TensorBlockV2::materialize(m_buffer, m_impl.dimensions(), desc, scratch); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 5d4b0f061..c9d78ba9b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -644,6 +644,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi } } + // No strides for scalars. + if (NumDims == 0) return; + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); const Sizes& output_dims = op.sizes(); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index f3907be6e..a0b4e04b1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -334,8 +334,12 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device // Want to copy from input. (output_inner_dim_size - output_inner_pad_before_size), // Can copy from input. 
- (static_cast<Index>(m_impl.dimensions()[inner_dim_idx]) - - numext::maxi(input_offsets[inner_dim_idx], Index(0)))); + numext::maxi( + static_cast<Index>(m_impl.dimensions()[inner_dim_idx]) - + (input_offsets[inner_dim_idx] + output_inner_pad_before_size), + Index(0))); + + eigen_assert(output_inner_copy_size >= 0); // How many values to fill with padding AFTER reading from the input inner // dimension. |