From 878845cb25c1ba9e56883fd0654eafb55a22fc34 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Fri, 28 Jun 2019 11:13:44 -0700
Subject: Add block access to TensorReverseOp and make sure that
 TensorForcedEval uses block access when preferred

---
 unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h  |  38 ++++-
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h        |   2 +-
 .../Eigen/CXX11/src/Tensor/TensorForcedEval.h      |   8 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 158 +++++++++++++++++++--
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h       |  22 ++-
 5 files changed, 203 insertions(+), 25 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 554ee5f59..910472ad8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -77,6 +77,8 @@ class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>, ReadOnlyAccessors>
   typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
 
+  static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
       : m_xpr(expr), m_buffer(buffer) {}
 
@@ -105,15 +107,22 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = true,
     PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = true
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = true
   };
 
+  typedef typename internal::TensorBlock<
+      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+      TensorBlock;
+  typedef typename internal::TensorBlockReader<
+      CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+      TensorBlockReader;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_device(device),
         m_buffer(op.buffer()), m_op(op), m_expression(op.expression())
@@ -143,6 +152,18 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
     internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
+    TensorBlock eval_to_block(block->first_coeff_index(), block->block_sizes(),
+                              block->tensor_strides(), block->tensor_strides(),
+                              m_buffer + block->first_coeff_index());
+    m_impl.block(&eval_to_block);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
@@ -158,6 +179,11 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+    assert(m_buffer != NULL);
+    TensorBlockReader::Run(block, m_buffer);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     // We assume that evalPacket or evalScalar is called to perform the
     // assignment and account for the cost of the write here.
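The TensorEvalTo changes above let a forced evaluation fill its destination buffer block by block instead of coefficient by coefficient whenever the nested expression reports block access. A minimal usage sketch of the user-visible effect (not part of the patch; the function name, tensor shapes and expression are illustrative only):

    #include <unsupported/Eigen/CXX11/Tensor>

    void forced_eval_sketch() {
      Eigen::Tensor<float, 3> input(32, 64, 16);
      input.setRandom();

      // Reverse two dimensions and force evaluation into a temporary buffer.
      Eigen::array<bool, 3> rev = {{true, false, true}};

      // .eval() creates a TensorForcedEvalOp; with this patch the TensorEvalToOp
      // it uses internally advertises BlockAccess, so the executor may
      // materialize the reversed expression tile by tile when the nested
      // evaluator prefers block access.
      Eigen::Tensor<float, 3> out = input.reverse(rev).eval() * 2.0f;
    }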
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 7b5842571..647c98d4e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -346,7 +346,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
       const int thread_idx = device.currentThreadId();
       eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
-      Scalar* thread_buf = reinterpret_cast<Scalar*>(
+      ScalarNoConst* thread_buf = reinterpret_cast<ScalarNoConst*>(
           static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
       for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
         auto block = block_mapper.GetBlockForIndex(i, thread_buf);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 74b905329..186457a31 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -126,8 +126,14 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
     }
     typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
     EvalTo evalToTmp(m_buffer, m_op);
+
     const bool Vectorize = internal::IsVectorizable<Device, const ArgType>::value;
-    internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type, Vectorize>::run(evalToTmp, m_device);
+    const bool Tile = TensorEvaluator<const ArgType, Device>::BlockAccess &&
+                      TensorEvaluator<const ArgType, Device>::PreferBlockAccess;
+
+    internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type,
+                             Vectorize, Tile>::run(evalToTmp, m_device);
     return true;
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index b7fb969f3..33af7d995 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -111,18 +111,25 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device
   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = true,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false,
   };
 
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
+      OutputTensorBlock;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_reverse(op.reverse())
+      : m_impl(op.expression(), device),
+        m_reverse(op.reverse()),
+        m_device(device)
   {
     // Reversing a scalar isn't supported yet. It would be a no-op anyway.
     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -140,6 +147,10 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device
         m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
       }
     }
+
+    // Remember the strides for fast division.
+    for (int i = 0; i < NumDims; ++i) {
+      m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]);
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -159,7 +170,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -173,7 +184,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device
       }
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -212,6 +223,131 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device
     return rslt;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+        1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::kSkewedInnerDims, block_total_size_max));
+  }
+
+  struct BlockIteratorState {
+    Index block_size;
+    Index block_stride;
+    Index block_span;
+    Index input_size;
+    Index input_stride;
+    Index input_span;
+    Index count;
+    bool reverse;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      OutputTensorBlock* output_block) const {
+    if (NumDims <= 0) return;
+
+    // TODO(ezhulenev): If underlying tensor expression supports and prefers
+    // block evaluation we must use it. Currently we use coeff and packet
+    // access into the underlying tensor expression.
+    // static const bool useBlockAccessForArgType =
+    //     TensorEvaluator<ArgType, Device>::BlockAccess &&
+    //     TensorEvaluator<ArgType, Device>::PreferBlockAccess;
+
+    static const bool isColMajor =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
+    const bool inner_dim_reversed = m_reverse[inner_dim_idx];
+
+    CoeffReturnType* data = output_block->data();
+    Index block_offset = 0;
+
+    Index input_offset = reverseIndex(output_block->first_coeff_index());
+
+    // Initialize output block iterator state. Dimensions in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (Index i = 0; i < NumDims; ++i) {
+      const Index dim = isColMajor ? i : NumDims - 1 - i;
+      it[i].block_size = output_block->block_sizes()[dim];
+      it[i].block_stride = output_block->block_strides()[dim];
+      it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
+      it[i].input_size = m_dimensions[dim];
+      it[i].input_stride = m_strides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+      it[i].count = 0;
+      it[i].reverse = m_reverse[dim];
+
+      if (it[i].reverse) {
+        it[i].input_stride = -1 * it[i].input_stride;
+        it[i].input_span = -1 * it[i].input_span;
+      }
+    }
+
+    // If multiple inner dimensions have the same reverse flag, check if we can
+    // merge them into a single virtual inner dimension.
+    int effective_inner_dim = 0;
+    for (int i = 1; i < NumDims; ++i) {
+      if (it[i].reverse != it[effective_inner_dim].reverse) break;
+      if (it[i].block_stride != it[effective_inner_dim].input_size) break;
+      if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
+
+      it[i].block_size = it[effective_inner_dim].block_size * it[i].block_size;
+      it[i].input_size = it[effective_inner_dim].input_size * it[i].input_size;
+
+      it[i].block_stride = 1;
+      it[i].input_stride = (inner_dim_reversed ? -1 : 1);
+
+      it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
+      it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+
+      effective_inner_dim = i;
+    }
+
+    eigen_assert(it[effective_inner_dim].block_stride == 1);
+    eigen_assert(it[effective_inner_dim].input_stride ==
+                 (inner_dim_reversed ? -1 : 1));
+
+    const Index inner_dim_size = it[effective_inner_dim].block_size;
+
+    while (it[NumDims - 1].count < it[NumDims - 1].block_size) {
+      // Copy inner-most dimension data from reversed location in input.
+      Index dst = block_offset;
+      Index src = input_offset;
+
+      // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
+      // worse results in benchmarks than a simple coefficient loop.
+      if (inner_dim_reversed) {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          data[dst] = m_impl.coeff(src);
+          ++dst;
+          --src;
+        }
+      } else {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          data[dst] = m_impl.coeff(src);
+          ++dst;
+          ++src;
+        }
+      }
+
+      // For the 1d tensor we need to generate only one inner-most dimension.
+      if ((NumDims - effective_inner_dim) == 1) break;
+
+      // Update offset.
+      for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].block_size) {
+          block_offset += it[i].block_stride;
+          input_offset += it[i].input_stride;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        block_offset -= it[i].block_span;
+        input_offset -= it[i].input_span;
+      }
+    }
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                      2 * TensorOpCost::MulCost<Index>() +
@@ -235,8 +371,10 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device
  protected:
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides;
   TensorEvaluator<ArgType, Device> m_impl;
   ReverseDimensions m_reverse;
+  const Device& m_device;
 };
 
 // Eval as lvalue
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 416948765..b577d4d36 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -270,6 +270,11 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
             input_block_strides[i + 1] * input_block_sizes[i + 1];
       }
     }
+    DSizes<internal::TensorIntDivisor<Index>, NumDims> fast_input_block_strides;
+    for (int i = 0; i < NumDims; ++i) {
+      fast_input_block_strides[i] =
+          internal::TensorIntDivisor<Index>(input_block_strides[i]);
+    }
 
     // Read input block.
     TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
@@ -293,8 +298,9 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
         continue;
       }
 
-      Index output_index = GetBlockOutputIndex(input_index, input_block_strides,
-                                               output_block_strides);
+      Index output_index =
+          GetBlockOutputIndex(input_index, input_block_strides,
+                              output_block_strides, fast_input_block_strides);
       if (output_index == input_index) {
         // Coefficient already in place.
         bitmap[output_index] = true;
@@ -312,8 +318,9 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
         data[output_index] = shuffled_value;
         shuffled_value = evicted_value;
         bitmap[output_index] = true;
-        output_index = GetBlockOutputIndex(output_index, input_block_strides,
-                                           output_block_strides);
+        output_index =
+            GetBlockOutputIndex(output_index, input_block_strides,
+                                output_block_strides, fast_input_block_strides);
       } while (output_index != input_index);
 
       data[output_index] = shuffled_value;
@@ -341,11 +348,12 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
       Index input_index, const DSizes<Index, NumDims>& input_block_strides,
-      const DSizes<Index, NumDims>& output_block_strides) const {
+      const DSizes<Index, NumDims>& output_block_strides,
+      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
     Index output_index = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = input_index / input_block_strides[i];
+        const Index idx = input_index / fast_input_block_strides[i];
         output_index += idx * output_block_strides[m_inverseShuffle[i]];
         input_index -= idx * input_block_strides[i];
       }
@@ -353,7 +361,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
           output_block_strides[m_inverseShuffle[0]];
     } else {
      for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = input_index / input_block_strides[i];
+        const Index idx = input_index / fast_input_block_strides[i];
         output_index += idx * output_block_strides[m_inverseShuffle[i]];
         input_index -= idx * input_block_strides[i];
       }
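For reference, the coordinate flip that both reverseIndex() and the new block() method in TensorReverse perform can be condensed into a small standalone routine. A simplified sketch in plain C++ (column-major case only; ordinary integer division stands in for internal::TensorIntDivisor, and all names and types below are illustrative, not Eigen's):

    #include <array>
    #include <cstddef>

    // Map a linear output index to the corresponding input index when some
    // dimensions are reversed: decompose the index with the strides, flip the
    // coordinates of reversed dimensions, then reassemble.
    template <int NumDims>
    std::ptrdiff_t reverse_index_sketch(
        std::ptrdiff_t index,
        const std::array<std::ptrdiff_t, NumDims>& dims,
        const std::array<std::ptrdiff_t, NumDims>& strides,
        const std::array<bool, NumDims>& reverse) {
      std::ptrdiff_t input_index = 0;
      for (int i = NumDims - 1; i > 0; --i) {
        std::ptrdiff_t idx = index / strides[i];  // m_fastStrides[i] in the patch
        index -= idx * strides[i];
        if (reverse[i]) idx = dims[i] - idx - 1;  // flip the reversed coordinate
        input_index += idx * strides[i];
      }
      if (reverse[0]) {
        input_index += dims[0] - index - 1;
      } else {
        input_index += index;
      }
      return input_index;
    }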
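Both TensorReverse and TensorShuffling now cache internal::TensorIntDivisor values so the repeated stride divisions above avoid hardware integer division. A hedged sketch of the call pattern, assuming the divisor type is reachable through the main Tensor header; this is Eigen-internal machinery used by the patch itself, so treat it as illustrative rather than a stable public API (the concrete numbers are arbitrary):

    #include <unsupported/Eigen/CXX11/Tensor>

    void fast_divisor_sketch() {
      // Precompute the divisor once per stride...
      const Eigen::Index stride = 192;
      const Eigen::internal::TensorIntDivisor<Eigen::Index> fast_stride(stride);

      // ...then reuse it for many index decompositions: the division becomes a
      // multiply-and-shift instead of an idiv.
      const Eigen::Index linear_index = 5000;
      const Eigen::Index coord = linear_index / fast_stride;
      const Eigen::Index remainder = linear_index - coord * stride;
      (void)remainder;
    }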