From 83c0a16baf5ecac6288cd9b74536a82de8985b31 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Tue, 31 Jul 2018 15:56:31 -0700
Subject: Add block evaluation support to TensorOps

---
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 329 ++++++++++++++++++---
 1 file changed, 291 insertions(+), 38 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 498488649..3747bff9e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -102,27 +102,64 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
   typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
   typedef NewDimensions Dimensions;
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  static const int NumOutputDims = internal::array_size<Dimensions>::value;
+  static const int NumInputDims = internal::array_size<
+      typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+    // TODO(andydavis, wuke) Enable BlockAccess for the general case when the
+    // performance issue with block-based reshape is resolved.
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
+                  TensorEvaluator<ArgType, Device>::RawAccess &&
+                  NumInputDims > 0 && NumOutputDims > 0,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  using InputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>;
+  using OutputTensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumOutputDims, Layout>;
+  using OutputTensorBlockReader = internal::TensorBlockReader<ScalarNoConst, Index, NumOutputDims, Layout>;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_dimensions(op.dimensions())
   {
     // The total size of the reshaped tensor must be equal to the total size
     // of the input tensor.
     eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
-  }
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+    if (BlockAccess) {
+      const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
+          m_impl.dimensions();
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        m_outputStrides[0] = 1;
+        for (int i = 1; i < NumOutputDims; ++i) {
+          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        }
+        m_inputStrides[0] = 1;
+        for (int i = 1; i < NumInputDims; ++i) {
+          m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
+        }
+      } else {
+        m_outputStrides[NumOutputDims - 1] = 1;
+        for (int i = NumOutputDims - 2; i >= 0; --i) {
+          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        }
+        m_inputStrides[NumInputDims - 1] = 1;
+        for (int i = NumInputDims - 2; i >= 0; --i) {
+          m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
+        }
+      }
+    }
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
@@ -148,6 +185,140 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
     return m_impl.costPerCoeff(vectorized);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    m_impl.getResourceRequirements(resources);
+  }
+
+  // TODO(andydavis) Reduce the overhead of this function.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      OutputTensorBlock* output_block) const {
+    if (m_impl.data() != NULL) {
+      OutputTensorBlockReader::Run(output_block, m_impl.data());
+      return;
+    }
+
+    // Calculate output block unit-stride inner dimension length.
+    const DSizes<Index, NumOutputDims>& output_block_sizes =
+        output_block->block_sizes();
+    Index output_inner_dim_size = 1;
+    Index output_outer_dim_start = NumOutputDims;
+    for (Index i = 0; i < NumOutputDims; ++i) {
+      const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                            ? i : NumOutputDims - i - 1;
+      output_inner_dim_size *= output_block_sizes[dim];
+      if (output_block_sizes[dim] < m_dimensions[dim]) {
+        output_outer_dim_start = i + 1;
+        break;
+      }
+    }
+
+    // Initialize output block iterator state.
+    struct BlockIteratorState {
+      Index stride;
+      Index span;
+      Index size;
+      Index count;
+    };
+    array<BlockIteratorState, NumOutputDims> block_iter_state;
+
+    for (Index i = 0; i < NumOutputDims; ++i) {
+      const Index dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                            ? i : NumOutputDims - i - 1;
+      block_iter_state[i].size = output_block_sizes[dim];
+      block_iter_state[i].stride = m_outputStrides[dim];
+      block_iter_state[i].span =
+          block_iter_state[i].stride * (block_iter_state[i].size - 1);
+      block_iter_state[i].count = 0;
+    }
+
+    const Index output_outer_dim_size = output_block_sizes.TotalSize() /
+        output_inner_dim_size;
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims =
+        m_impl.dimensions();
+
+    Index index = output_block->first_coeff_index();
+    for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) {
+      Index inner_idx = 0;
+      while (inner_idx < output_inner_dim_size) {
+        // Calculate input coords based on 'index'.
+        array<Index, NumInputDims> input_coords;
+        Index idx = index;
+        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+          for (int i = NumInputDims - 1; i > 0; --i) {
+            input_coords[i] = idx / m_inputStrides[i];
+            idx -= input_coords[i] * m_inputStrides[i];
+          }
+          input_coords[0] = idx;
+        } else {
+          for (int i = 0; i < NumInputDims - 1; ++i) {
+            input_coords[i] = idx / m_inputStrides[i];
+            idx -= input_coords[i] * m_inputStrides[i];
+          }
+          input_coords[NumInputDims - 1] = idx;
+        }
+
+        // Calculate target input block shape, using at most
+        // 'output_inner_dim_size' coefficients along the input block's inner
+        // dimensions.
+        DSizes<Index, NumInputDims> input_block_sizes;
+        Index num_to_allocate = output_inner_dim_size - inner_idx;
+        for (Index i = 0; i < NumInputDims; ++i) {
+          const Index dim =
+              static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                  ? i : NumInputDims - i - 1;
+          input_block_sizes[dim] = numext::mini(
+              num_to_allocate, (static_cast<Index>(input_dims[dim]) -
+                                input_coords[dim]));
+          if (input_coords[dim] == 0) {
+            num_to_allocate /= input_block_sizes[dim];
+          } else {
+            num_to_allocate = 1;
+          }
+        }
+
+        // Calculate input block strides.
+        DSizes<Index, NumInputDims> input_block_strides;
+        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+          input_block_strides[0] = 1;
+          for (int i = 1; i < NumInputDims; ++i) {
+            input_block_strides[i] = input_block_strides[i - 1] *
+                input_block_sizes[i - 1];
+          }
+        } else {
+          input_block_strides[NumInputDims - 1] = 1;
+          for (int i = NumInputDims - 2; i >= 0; --i) {
+            input_block_strides[i] = input_block_strides[i + 1] *
+                input_block_sizes[i + 1];
+          }
+        }
+
+        // Instantiate and read input block from input tensor.
+        InputTensorBlock input_block(index, input_block_sizes,
+                                     input_block_strides, m_inputStrides,
+                                     output_block->data() + outer_idx *
+                                         output_inner_dim_size + inner_idx);
+
+        m_impl.block(&input_block);
+
+        const Index input_block_total_size = input_block_sizes.TotalSize();
+        index += input_block_total_size;
+        inner_idx += input_block_total_size;
+      }
+      eigen_assert(inner_idx == output_inner_dim_size);
+      index -= output_inner_dim_size;
+      // Update index.
+      for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) {
+        if (++block_iter_state[i].count < block_iter_state[i].size) {
+          index += block_iter_state[i].stride;
+          break;
+        }
+        block_iter_state[i].count = 0;
+        index -= block_iter_state[i].span;
+      }
+    }
+  }
+
   EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return const_cast<Scalar*>(m_impl.data()); }
 
   EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
@@ -155,6 +326,8 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
  protected:
   TensorEvaluator<ArgType, Device> m_impl;
   NewDimensions m_dimensions;
+  DSizes<Index, NumOutputDims> m_outputStrides;
+  DSizes<Index, NumInputDims> m_inputStrides;
 };
 
 
@@ -322,17 +495,27 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
   static const int NumDims = internal::array_size<Sizes>::value;
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Sizes Dimensions;
+
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
     // slice offsets and sizes.
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,
-    RawAccess = false
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
   };
 
+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  using TensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
   {
@@ -340,6 +523,16 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
     }
 
+    m_is_identity = true;
+    for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+      eigen_assert(m_impl.dimensions()[i] >=
+                   op.sizes()[i] + op.startIndices()[i]);
+      if (m_impl.dimensions()[i] != op.sizes()[i] ||
+          op.startIndices()[i] != 0) {
+        m_is_identity = false;
+      }
+    }
+
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Sizes& output_dims = op.sizes();
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -367,13 +560,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     }
-  }
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef Sizes Dimensions;
+    m_block_total_size_max =
+        numext::maxi<Index>(1, device.lastLevelCacheSize() / sizeof(Scalar));
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
@@ -417,7 +607,11 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }
 
   template<int LoadMode>
@@ -427,6 +621,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
 
+    if (m_is_identity) {
+      return m_impl.template packet<LoadMode>(index);
+    }
+
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -469,9 +667,26 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::TensorBlockShapeType::kSkewedInnerDims,
+        m_block_total_size_max));
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      TensorBlock* output_block) const {
+    TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
+                            output_block->block_sizes(),
+                            output_block->block_strides(),
+                            Dimensions(m_inputStrides),
+                            output_block->data());
+    m_impl.block(&input_block);
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits<XprType>::PointerType data() const {
     Scalar* result = m_impl.data();
@@ -544,7 +759,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   TensorEvaluator<ArgType, Device> m_impl;
   const Device& m_device;
   Dimensions m_dimensions;
+  bool m_is_identity;
   const StartIndices m_offsets;
+  Index m_block_total_size_max;
 };
 
 
@@ -557,33 +774,46 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
   static const int NumDims = internal::array_size<Sizes>::value;
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Sizes Dimensions;
+
   enum {
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,
-    RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
   };
 
+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  using TensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef Sizes Dimensions;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
-    return this->m_impl.coeffRef(this->srcCoeff(index));
+    if (this->m_is_identity) {
+      return this->m_impl.coeffRef(index);
+    } else {
+      return this->m_impl.coeffRef(this->srcCoeff(index));
+    }
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
+    if (this->m_is_identity) {
+      this->m_impl.template writePacket<StoreMode>(index, x);
+      return;
+    }
+
     const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -623,6 +853,14 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       }
     }
   }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlock& block) {
+    this->m_impl.writeBlock(TensorBlock(
+        this->srcCoeff(block.first_coeff_index()), block.block_sizes(),
+        block.block_strides(), Dimensions(this->m_inputStrides),
+        const_cast<ScalarNoConst*>(block.data())));
+  }
 };
 
 
@@ -739,7 +977,13 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
     DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped;
+    m_is_identity = true;
     for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+      if (m_strides[i] != 1 || op.startIndices()[i] != 0 ||
+          op.stopIndices()[i] != (m_impl.dimensions()[i] - 1)) {
+        m_is_identity = false;
+      }
+
       eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
       if (m_strides[i] > 0) {
         startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
@@ -822,11 +1066,15 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
   }
 
   EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const {
@@ -873,6 +1121,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
   array<Index, NumDims> m_outputStrides;
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
+  bool m_is_identity;
   TensorEvaluator<ArgType, Device> m_impl;
   const Device& m_device;
   DSizes<Index, NumDims> m_startIndices; // clamped startIndices
@@ -916,7 +1165,11 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
-    return this->m_impl.coeffRef(this->srcCoeff(index));
+    if (this->m_is_identity) {
+      return this->m_impl.coeffRef(index);
+    } else {
+      return this->m_impl.coeffRef(this->srcCoeff(index));
+    }
   }
 };
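Usage note (not part of the patch): the sketch below is a minimal, hypothetical example of the kind of user-level expressions whose evaluators this patch touches. It uses only the public unsupported Tensor API (`reshape`, `slice`); the tensor sizes and variable names are invented for illustration, and whether the block-based path is actually taken depends on the executor and device used to evaluate the expression.

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // A 64x32 input tensor with random contents.
  Eigen::Tensor<float, 2> input(64, 32);
  input.setRandom();

  // Reshape 64x32 -> 32x64. With this patch the reshape evaluator can report
  // BlockAccess when its argument supports block and raw access, so a blocking
  // executor may evaluate it block-by-block rather than per coefficient.
  Eigen::array<Eigen::Index, 2> new_dims{{32, 64}};
  Eigen::Tensor<float, 2> reshaped = input.reshape(new_dims);

  // Slice a 16x16 window starting at (8, 8). A slice whose offsets are all
  // zero and whose extents match the input dimensions now takes the cheaper
  // m_is_identity fast path added above.
  Eigen::array<Eigen::Index, 2> offsets{{8, 8}};
  Eigen::array<Eigen::Index, 2> extents{{16, 16}};
  Eigen::Tensor<float, 2> window = input.slice(offsets, extents);

  return 0;
}
```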