From 83c0a16baf5ecac6288cd9b74536a82de8985b31 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Tue, 31 Jul 2018 15:56:31 -0700
Subject: Add block evaluation support to TensorOps

---
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 237 ++++++++++++++++++---
 1 file changed, 210 insertions(+), 27 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 6b54f40ad..3add9dac0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -100,6 +100,7 @@ class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
 template<typename Shuffle, typename ArgType, typename Device>
 struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 {
+  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
   typedef TensorShufflingOp<Shuffle, ArgType> XprType;
   typedef typename XprType::Index Index;
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
@@ -110,44 +111,60 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned = false,
+    IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
   };
 
+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  using TensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+  using TensorBlockReader = internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation())
   {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Shuffle& shuffle = op.shufflePermutation();
+    m_is_identity = true;
     for (int i = 0; i < NumDims; ++i) {
       m_dimensions[i] = input_dims[shuffle[i]];
+      m_inverseShuffle[shuffle[i]] = i;
+      if (m_is_identity && shuffle[i] != i) {
+        m_is_identity = false;
+      }
     }
 
-    array<Index, NumDims> inputStrides;
-
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      inputStrides[0] = 1;
+      m_unshuffledInputStrides[0] = 1;
       m_outputStrides[0] = 1;
+
       for (int i = 1; i < NumDims; ++i) {
-        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
+        m_unshuffledInputStrides[i] =
+            m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
         m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     } else {
-      inputStrides[NumDims - 1] = 1;
+      m_unshuffledInputStrides[NumDims - 1] = 1;
       m_outputStrides[NumDims - 1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
-        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+        m_unshuffledInputStrides[i] =
+            m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
         m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     }
 
     for (int i = 0; i < NumDims; ++i) {
-      m_inputStrides[i] = inputStrides[shuffle[i]];
+      m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
     }
+
+    m_block_total_size_max =
+        numext::maxi<Index>(1, device.firstLevelCacheSize() / sizeof(Scalar));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -162,29 +179,151 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }
 
+  template <int LoadMode, typename Self, bool ImplPacketAccess>
+  struct PacketLoader {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    static PacketReturnType Run(const Self& self, Index index) {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = self.coeff(index + i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  };
+
+  template <int LoadMode, typename Self>
+  struct PacketLoader<LoadMode, Self, true> {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    static PacketReturnType Run(const Self& self, Index index) {
+      if (self.m_is_identity) {
+        return self.m_impl.template packet<LoadMode>(index);
+      } else {
+        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+        for (int i = 0; i < PacketSize; ++i) {
+          values[i] = self.coeff(index + i);
+        }
+        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+        return rslt;
+      }
+    }
+  };
+
   template <int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+    EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
+  }
 
-    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    for (int i = 0; i < PacketSize; ++i) {
-      values[i] = coeff(index+i);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::TensorBlockShapeType::kUniformAllDims,
+        m_block_total_size_max));
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      TensorBlock* output_block) const {
+    if (m_impl.data() != NULL) {
+      // Fast path: we have direct access to the data, so shuffle as we read.
+      TensorBlockReader::Run(output_block,
+                             srcCoeff(output_block->first_coeff_index()),
+                             m_inverseShuffle,
+                             m_unshuffledInputStrides,
+                             m_impl.data());
+      return;
+    }
+
+    // Slow path: read unshuffled block from the input and shuffle in-place.
+    // Initialize input block sizes using input-to-output shuffle map.
+    DSizes<Index, NumDims> input_block_sizes;
+    for (Index i = 0; i < NumDims; ++i) {
+      input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]];
+    }
+
+    // Calculate input block strides.
+    DSizes<Index, NumDims> input_block_strides;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      input_block_strides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        input_block_strides[i] =
+            input_block_strides[i - 1] * input_block_sizes[i - 1];
+      }
+    } else {
+      input_block_strides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        input_block_strides[i] =
+            input_block_strides[i + 1] * input_block_sizes[i + 1];
+      }
+    }
+
+    // Read input block.
+    TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
+                            input_block_sizes,
+                            input_block_strides,
+                            Dimensions(m_unshuffledInputStrides),
+                            output_block->data());
+
+    m_impl.block(&input_block);
+
+    // Naive In-place shuffle: random IO but block size is O(L1 cache size).
+    // TODO(andydavis) Improve the performance of this in-place shuffle.
+    const Index total_size = input_block_sizes.TotalSize();
+    std::vector<bool> bitmap(total_size, false);
+    ScalarNoConst* data = const_cast<ScalarNoConst*>(output_block->data());
+    const DSizes<Index, NumDims>& output_block_strides =
+        output_block->block_strides();
+    for (Index input_index = 0; input_index < total_size; ++input_index) {
+      if (bitmap[input_index]) {
+        // Coefficient at this index has already been shuffled.
+        continue;
+      }
+
+      Index output_index = GetBlockOutputIndex(input_index, input_block_strides,
+                                               output_block_strides);
+      if (output_index == input_index) {
+        // Coefficient already in place.
+        bitmap[output_index] = true;
+        continue;
+      }
+
+      // The following loop starts at 'input_index', and shuffles
+      // coefficients into their shuffled location at 'output_index'.
+      // It skips through the array shuffling coefficients by following
+      // the shuffle cycle starting and ending at 'input_index'.
+      ScalarNoConst evicted_value;
+      ScalarNoConst shuffled_value = data[input_index];
+      do {
+        evicted_value = data[output_index];
+        data[output_index] = shuffled_value;
+        shuffled_value = evicted_value;
+        bitmap[output_index] = true;
+        output_index = GetBlockOutputIndex(output_index, input_block_strides,
+                                           output_block_strides);
+      } while (output_index != input_index);
+
+      data[output_index] = shuffled_value;
+      bitmap[output_index] = true;
     }
-    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-    return rslt;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+    const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
+                                NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                            2 * TensorOpCost::MulCost<Index>() +
                                            TensorOpCost::DivCost<Index>());
     return m_impl.costPerCoeff(vectorized) +
-           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
   }
 
   EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
@@ -195,27 +334,57 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {return m_impl;}
 
 protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
+      Index input_index,
+      const DSizes<Index, NumDims>& input_block_strides,
+      const DSizes<Index, NumDims>& output_block_strides) const {
+    Index output_index = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = input_index / input_block_strides[i];
+        output_index += idx * output_block_strides[m_inverseShuffle[i]];
+        input_index -= idx * input_block_strides[i];
+      }
+      return output_index + input_index *
+          output_block_strides[m_inverseShuffle[0]];
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = input_index / input_block_strides[i];
+        output_index += idx * output_block_strides[m_inverseShuffle[i]];
+        input_index -= idx * input_block_strides[i];
+      }
+      return output_index + input_index *
+          output_block_strides[m_inverseShuffle[NumDims - 1]];
+    }
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = index / m_outputStrides[i];
+        const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
       return inputIndex + index * m_inputStrides[0];
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = index / m_outputStrides[i];
+        const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
       return inputIndex + index * m_inputStrides[NumDims - 1];
     }
   }
+
   Dimensions m_dimensions;
+  bool m_is_identity;
+  array<Index, NumDims> m_inverseShuffle;
   array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
+  array<Index, NumDims> m_unshuffledInputStrides;
+  Index m_block_total_size_max;
   TensorEvaluator<ArgType, Device> m_impl;
   /// required by sycl
   Shuffle m_shuffle;
@@ -239,12 +408,18 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned = false,
+    IsAligned = false,
     PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-    BlockAccess = false,
-    RawAccess = false
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = false
   };
 
+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  using TensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+  using TensorBlockWriter = internal::TensorBlockWriter<ScalarNoConst, Index, NumDims, Layout>;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : Base(op, device)
   { }
@@ -265,6 +440,14 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
       this->coeffRef(index+i) = values[i];
     }
   }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlock& block) {
+    eigen_assert(this->m_impl.data() != NULL);
+    TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()),
+                           this->m_inverseShuffle,
+                           this->m_unshuffledInputStrides, this->m_impl.data());
+  }
 };
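
For reference, the operation this evaluator implements reorders tensor dimensions: output dimension i has the size of input dimension shuffle[i], and indices are mapped back through the inverse permutation (the m_inverseShuffle array above). A minimal usage sketch with the public Tensor API; the sizes and the permutation below are arbitrary illustrations, not taken from the patch:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> input(2, 3, 4);
  input.setRandom();

  // shuffle = {1, 2, 0}: output dim i gets the size of input dim shuffle[i],
  // so the result is 3 x 4 x 2 and output(i, j, k) == input(k, i, j).
  Eigen::array<int, 3> shuffle{{1, 2, 0}};
  Eigen::Tensor<float, 3> output = input.shuffle(shuffle);

  return output.dimension(0) == 3 ? 0 : 1;
}

An identity permutation makes the index mapping a no-op, which is exactly the case the new m_is_identity flag short-circuits in coeff(), packet() and costPerCoeff().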
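The slow path in block() above shuffles a just-read block in place by walking permutation cycles, using a bitmap so each coefficient is moved exactly once. Below is a rough standalone sketch of that cycle walk; the function name and the std::function-based index map standing in for GetBlockOutputIndex are illustrative assumptions, not the Eigen internals:

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

// In-place permutation: the value currently stored at index i ends up at map(i).
// 'map' must be a bijection on [0, data.size()).
template <typename T>
void PermuteInPlace(std::vector<T>& data,
                    const std::function<std::size_t(std::size_t)>& map) {
  std::vector<bool> done(data.size(), false);
  for (std::size_t start = 0; start < data.size(); ++start) {
    if (done[start]) continue;                            // moved by an earlier cycle
    std::size_t dst = map(start);
    if (dst == start) { done[start] = true; continue; }   // fixed point, nothing to move
    T carried = data[start];                              // value travelling along the cycle
    do {
      std::swap(carried, data[dst]);                      // place carried value, pick up the evicted one
      done[dst] = true;
      dst = map(dst);
    } while (dst != start);
    data[start] = carried;                                // close the cycle
    done[start] = true;
  }
}

In the patch the map is GetBlockOutputIndex: delinearize the index with the input block strides, permute the coordinates through m_inverseShuffle, and relinearize with the output block strides. Keeping the block near L1-cache size (m_block_total_size_max) bounds the cost of these random accesses.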