diff options
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 247 |
1 files changed, 218 insertions, 29 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index e25dd9cf8..98f125408 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -100,6 +100,7 @@ class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> template<typename Shuffle, typename ArgType, typename Device> struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> { + typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self; typedef TensorShufflingOp<Shuffle, ArgType> XprType; typedef typename XprType::Index Index; static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; @@ -110,43 +111,61 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> static const int PacketSize = PacketType<CoeffReturnType, Device>::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation()) + typedef typename internal::remove_const<Scalar>::type ScalarNoConst; + + typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> + TensorBlock; + typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout> + TensorBlockReader; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_device(device), + m_impl(op.expression(), device), + 
m_shuffle(op.shufflePermutation()) { const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); + m_is_identity = true; for (int i = 0; i < NumDims; ++i) { m_dimensions[i] = input_dims[shuffle[i]]; + m_inverseShuffle[shuffle[i]] = i; + if (m_is_identity && shuffle[i] != i) { + m_is_identity = false; + } } - array<Index, NumDims> inputStrides; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - inputStrides[0] = 1; + m_unshuffledInputStrides[0] = 1; m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { - inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1]; + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); } } else { - inputStrides[NumDims - 1] = 1; + m_unshuffledInputStrides[NumDims - 1] = 1; m_outputStrides[NumDims - 1] = 1; for (int i = NumDims - 2; i >= 0; --i) { - inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]); } } for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] = inputStrides[shuffle[i]]; + m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; } } @@ -162,29 +181,152 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_impl.coeff(srcCoeff(index)); + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } } + template <int LoadMode, typename Self, bool ImplPacketAccess> + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static 
PacketReturnType Run(const Self& self, Index index) { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + }; + + template<int LoadMode, typename Self> + struct PacketLoader<LoadMode, Self, true> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static PacketReturnType Run(const Self& self, Index index) { + if (self.m_is_identity) { + return self.m_impl.template packet<LoadMode>(index); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + } + }; + template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index); + } - EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + auto block_total_size_max = numext::maxi<Eigen::Index>( + 1, m_device.firstLevelCacheSize() / sizeof(Scalar)); + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kUniformAllDims, block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (m_impl.data() != NULL) { + // Fast path: we have direct access to the data, so shuffle as we read. + TensorBlockReader::Run(output_block, + srcCoeff(output_block->first_coeff_index()), + m_inverseShuffle, + m_unshuffledInputStrides, + m_impl.data()); + return; + } + + // Slow path: read unshuffled block from the input and shuffle in-place. + // Initialize input block sizes using input-to-output shuffle map. + DSizes<Index, NumDims> input_block_sizes; + for (Index i = 0; i < NumDims; ++i) { + input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]]; + } + + // Calculate input block strides. + DSizes<Index, NumDims> input_block_strides; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + input_block_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + input_block_strides[i] = + input_block_strides[i - 1] * input_block_sizes[i - 1]; + } + } else { + input_block_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + input_block_strides[i] = + input_block_strides[i + 1] * input_block_sizes[i + 1]; + } + } + + // Read input block. + TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, + input_block_strides, + Dimensions(m_unshuffledInputStrides), + output_block->data()); + + m_impl.block(&input_block); + + // Naive In-place shuffle: random IO but block size is O(L1 cache size). + // TODO(andydavis) Improve the performance of this in-place shuffle. + const Index total_size = input_block_sizes.TotalSize(); + std::vector<bool> bitmap(total_size, false); + ScalarNoConst* data = const_cast<ScalarNoConst*>(output_block->data()); + const DSizes<Index, NumDims>& output_block_strides = + output_block->block_strides(); + for (Index input_index = 0; input_index < total_size; ++input_index) { + if (bitmap[input_index]) { + // Coefficient at this index has already been shuffled. 
+ continue; + } + + Index output_index = GetBlockOutputIndex(input_index, input_block_strides, + output_block_strides); + if (output_index == input_index) { + // Coefficient already in place. + bitmap[output_index] = true; + continue; + } + + // The following loop starts at 'input_index', and shuffles + // coefficients into their shuffled location at 'output_index'. + // It skips through the array shuffling coefficients by following + // the shuffle cycle starting and ending at 'input_index'. + ScalarNoConst evicted_value; + ScalarNoConst shuffled_value = data[input_index]; + do { + evicted_value = data[output_index]; + data[output_index] = shuffled_value; + shuffled_value = evicted_value; + bitmap[output_index] = true; + output_index = GetBlockOutputIndex(output_index, input_block_strides, + output_block_strides); + } while (output_index != input_index); + + data[output_index] = shuffled_value; + bitmap[output_index] = true; } - PacketReturnType rslt = internal::pload<PacketReturnType>(values); - return rslt; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + + const double compute_cost = m_is_identity ? 
TensorOpCost::AddCost<Index>() : + NumDims * (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>()); return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize); } EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; } @@ -195,27 +337,58 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {return m_impl;} protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex( + Index input_index, + const DSizes<Index, NumDims>& input_block_strides, + const DSizes<Index, NumDims>& output_block_strides) const { + Index output_index = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = input_index / input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = input_index / input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[NumDims - 1]]; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; + const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } return inputIndex + index * m_inputStrides[0]; } 
else { for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; + const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } return inputIndex + index * m_inputStrides[NumDims - 1]; } } + Dimensions m_dimensions; + bool m_is_identity; + array<Index, NumDims> m_inverseShuffle; array<Index, NumDims> m_outputStrides; + array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides; array<Index, NumDims> m_inputStrides; + array<Index, NumDims> m_unshuffledInputStrides; + + const Device& m_device; TensorEvaluator<ArgType, Device> m_impl; /// required by sycl Shuffle m_shuffle; @@ -239,12 +412,20 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> static const int PacketSize = PacketType<CoeffReturnType, Device>::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), - BlockAccess = false, - RawAccess = false + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + RawAccess = false }; + typedef typename internal::remove_const<Scalar>::type ScalarNoConst; + + typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> + TensorBlock; + typedef internal::TensorBlockWriter<ScalarNoConst, Index, NumDims, Layout> + TensorBlockWriter; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -265,6 +446,14 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> this->coeffRef(index+i) = values[i]; } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()), + this->m_inverseShuffle, + this->m_unshuffledInputStrides, this->m_impl.data()); + } }; |