From 83c0a16baf5ecac6288cd9b74536a82de8985b31 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Tue, 31 Jul 2018 15:56:31 -0700
Subject: Add block evaluation support to TensorOps

---
 .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 333 +++++++++++++++++++--
 1 file changed, 316 insertions(+), 17 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 343ab6269..a851e7f55 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -108,16 +108,29 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   bool isCopy = false, nByOne = false, oneByN = false;
   enum {
-    IsAligned = true,
+    IsAligned = true,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = false
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_broadcast(op.broadcast()), m_impl(op.expression(), device)
-  {
+  using ScalarNoConst = typename internal::remove_const<Scalar>::type;
+
+  // Block based access to the XprType (input) tensor.
+  using TensorBlock = internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>;
+  using TensorBlockReader = internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>;
+
+  // We do block based broadcasting using a trick with 2x tensor rank and 0
+  // strides. See the block() method implementation for details.
+  using BroadcastDimensions = DSizes<Index, 2 * NumDims>;
+  using BroadcastTensorBlock = internal::TensorBlock<ScalarNoConst, Index, 2 * NumDims, Layout>;
+  using BroadcastTensorBlockReader = internal::TensorBlockReader<ScalarNoConst, Index, 2 * NumDims, Layout>;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+                                                        const Device& device)
+      : m_device(device),
+        m_broadcast(op.broadcast()),
+        m_impl(op.expression(), device) {
     // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
     // and store the result in a scalar. Instead one should reshape the scalar into an N-D
     // tensor with N >= 1 of 1 element first and then broadcast.
@@ -216,8 +229,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   }
 
   // TODO: attempt to speed this up. The integer divisions and modulo are slow.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
-  {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
@@ -243,11 +255,15 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         inputIndex += (index % m_impl.dimensions()[0]);
       }
     }
-    return m_impl.coeff(inputIndex);
+    return inputIndex;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
   {
+    return m_impl.coeff(indexColMajor(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
     Index inputIndex = 0;
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index idx = index / m_outputStrides[i];
@@ -263,17 +279,22 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       }
       index -= idx * m_outputStrides[i];
     }
-    if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
-      eigen_assert(index < m_impl.dimensions()[NumDims-1]);
+    if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
+      eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
       inputIndex += index;
     } else {
-      if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
-        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
+      if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
       } else {
-        inputIndex += (index % m_impl.dimensions()[NumDims-1]);
+        inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
       }
     }
-    return m_impl.coeff(inputIndex);
+    return inputIndex;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+  {
+    return m_impl.coeff(indexRowMajor(index));
   }
 
   template<int LoadMode>
@@ -553,13 +574,291 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
            TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+      std::vector<internal::TensorOpResourceRequirements>* resources) const {
+    // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
+    // tensors. But this might need further tuning.
+    Index l1_cache_scalars = m_device.firstLevelCacheSize() / sizeof(Scalar);
+    Index block_total_size_max = numext::maxi(Index(1), l1_cache_scalars);
+
+    resources->push_back(internal::TensorOpResourceRequirements(
+        internal::TensorBlockShapeType::kSkewedInnerDims,
+        block_total_size_max));
+
+    m_impl.getResourceRequirements(resources);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+      TensorBlock* output_block) const {
+    if (NumDims <= 0) {
+      output_block->data()[0] = m_impl.coeff(0);
+      return;
+    }
+
+    // Because we only support kSkewedInnerDims blocking, the block size should
+    // be equal to m_dimensions for the inner dims, a size smaller than
+    // m_dimensions[i] for the first outer dim, and 1 for the other outer dims.
+    // This is guaranteed by MergeResourceRequirements() in TensorBlock.h.
+    const auto& output_block_sizes = output_block->block_sizes();
+    const auto& output_block_strides = output_block->block_strides();
+
+    // Find where outer dims start.
+    int outer_dim_start = 0;
+    Index outer_dim_size = 1, inner_dim_size = 1;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                          ? i
+                          : NumDims - i - 1;
+      if (i > outer_dim_start) {
+        eigen_assert(output_block_sizes[dim] == 1);
+      } else if (output_block_sizes[dim] != m_dimensions[dim]) {
+        eigen_assert(output_block_sizes[dim] < m_dimensions[dim]);
+        outer_dim_size = output_block_sizes[dim];
+      } else {
+        inner_dim_size *= output_block_sizes[dim];
+        ++outer_dim_start;
+      }
+    }
+
+    if (inner_dim_size == 0 || outer_dim_size == 0) {
+      return;
+    }
+
+    const auto& input_dims = m_impl.dimensions();
+
+    // Pre-fill input_block_sizes, broadcast_block_sizes,
+    // broadcast_block_strides, and broadcast_tensor_strides. Later on we will
+    // only modify the outer_dim_start-th dimension in these arrays.
+
+    // Calculate the input block size for looking into the input.
+    Dimensions input_block_sizes;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < outer_dim_start; ++i) {
+        input_block_sizes[i] = input_dims[i];
+      }
+      for (int i = outer_dim_start; i < NumDims; ++i) {
+        input_block_sizes[i] = 1;
+      }
+    } else {
+      for (int i = 0; i < outer_dim_start; ++i) {
+        input_block_sizes[NumDims - i - 1] = input_dims[NumDims - i - 1];
+      }
+      for (int i = outer_dim_start; i < NumDims; ++i) {
+        input_block_sizes[NumDims - i - 1] = 1;
+      }
+    }
+
+    // Broadcast with the 0-stride trick: create one extra dim for each
+    // broadcast dimension and set its input stride to 0.
+    //
+    // When ColMajor:
+    // - broadcast_block_sizes is [d_0, b_0, d_1, b_1, ...].
+    //
+    // - broadcast_block_strides is [output_block_strides[0],
+    //                               output_block_strides[0] * d_0,
+    //                               output_block_strides[1],
+    //                               output_block_strides[1] * d_1,
+    //                               ...].
+    //
+    // - broadcast_tensor_strides is [m_inputStrides[0],
+    //                                0,
+    //                                m_inputStrides[1],
+    //                                0,
+    //                                ...].
+    BroadcastDimensions broadcast_block_sizes, broadcast_block_strides,
+        broadcast_tensor_strides;
+
+    for (int i = 0; i < outer_dim_start; ++i) {
+      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                          ? i
+                          : NumDims - i - 1;
+      const int copy_dim =
+          static_cast<int>(Layout) == static_cast<int>(ColMajor)
+              ? 2 * i
+              : 2 * NumDims - 2 * i - 1;
+      const int broadcast_dim =
+          static_cast<int>(Layout) == static_cast<int>(ColMajor) ? copy_dim + 1
+                                                                 : copy_dim - 1;
+      broadcast_block_sizes[copy_dim] = input_dims[dim];
+      broadcast_block_sizes[broadcast_dim] = m_broadcast[dim];
+      broadcast_block_strides[copy_dim] = output_block_strides[dim];
+      broadcast_block_strides[broadcast_dim] =
+          output_block_strides[dim] * input_dims[dim];
+      broadcast_tensor_strides[copy_dim] = m_inputStrides[dim];
+      broadcast_tensor_strides[broadcast_dim] = 0;
+    }
+    for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) {
+      const int dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                          ? i
+                          : 2 * NumDims - i - 1;
+      broadcast_block_sizes[dim] = 1;
+      broadcast_block_strides[dim] = 0;
+      broadcast_tensor_strides[dim] = 0;
+    }
+
+    const int outer_dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                              ? outer_dim_start
+                              : NumDims - outer_dim_start - 1;
+
+    if (outer_dim_size == 1) {
+      // We just need one block read using the values already set above.
+      BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+                     broadcast_block_strides, broadcast_tensor_strides, 0,
+                     output_block);
+    } else if (input_dims[outer_dim] == 1) {
+      // Broadcast the outer_dim_start-th dimension (< NumDims) by outer_dim_size.
+      const int broadcast_outer_dim =
+          static_cast<int>(Layout) == static_cast<int>(ColMajor)
+              ? 2 * outer_dim_start + 1
+              : 2 * NumDims - 2 * outer_dim_start - 2;
+      broadcast_block_sizes[broadcast_outer_dim] = outer_dim_size;
+      broadcast_tensor_strides[broadcast_outer_dim] = 0;
+      broadcast_block_strides[broadcast_outer_dim] =
+          output_block_strides[outer_dim];
+      BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+                     broadcast_block_strides, broadcast_tensor_strides, 0,
+                     output_block);
+    } else {
+      // The general case. Let's denote the output block as x[...,
+      // a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on
+      // the outer_dim_start-th dimension (< NumDims). We need to split
+      // a:a+outer_dim_size into up to 3 sub-blocks:
+      //
+      // (1) a:b, where b is the smallest multiple of
+      //     input_dims[outer_dim_start] in [a, a+outer_dim_size].
+      //
+      // (2) b:c, where c is the largest multiple of
+      //     input_dims[outer_dim_start] in [a, a+outer_dim_size].
+      //
+      // (3) c:a+outer_dim_size.
+      //
+      // When b and c do not exist, we just process the whole block together.
+
+      // Find a.
+      const Index outer_dim_left_index =
+          output_block->first_coeff_index() / m_outputStrides[outer_dim];
+
+      // Find b and c.
+      const Index input_outer_dim_size = input_dims[outer_dim];
+
+      // First multiple of input_outer_dim_size at or after a. This is b when
+      // it is <= outer_dim_left_index + outer_dim_size.
+      const Index first_multiple =
+          divup(outer_dim_left_index, input_outer_dim_size) *
+          input_outer_dim_size;
+
+      if (first_multiple <= outer_dim_left_index + outer_dim_size) {
+        // b exists, and so does c. Find them.
+        const Index last_multiple = (outer_dim_left_index + outer_dim_size) /
+                                    input_outer_dim_size * input_outer_dim_size;
+        const int copy_outer_dim =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                ? 2 * outer_dim_start
+                : 2 * NumDims - 2 * outer_dim_start - 1;
+        const int broadcast_outer_dim =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                ? 2 * outer_dim_start + 1
+                : 2 * NumDims - 2 * outer_dim_start - 2;
+        if (first_multiple > outer_dim_left_index) {
+          const Index head_size = first_multiple - outer_dim_left_index;
+          input_block_sizes[outer_dim] = head_size;
+          broadcast_block_sizes[copy_outer_dim] = head_size;
+          broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
+          broadcast_block_strides[copy_outer_dim] =
+              output_block_strides[outer_dim];
+          broadcast_block_sizes[broadcast_outer_dim] = 1;
+          broadcast_tensor_strides[broadcast_outer_dim] = 0;
+          broadcast_block_strides[broadcast_outer_dim] =
+              output_block_strides[outer_dim] * input_dims[outer_dim];
+          BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+                         broadcast_block_strides, broadcast_tensor_strides, 0,
+                         output_block);
+        }
+        if (first_multiple < last_multiple) {
+          input_block_sizes[outer_dim] = input_outer_dim_size;
+          broadcast_block_sizes[copy_outer_dim] = input_outer_dim_size;
+          broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
+          broadcast_block_strides[copy_outer_dim] =
+              output_block_strides[outer_dim];
+          broadcast_block_sizes[broadcast_outer_dim] =
+              (last_multiple - first_multiple) / input_outer_dim_size;
+          broadcast_tensor_strides[broadcast_outer_dim] = 0;
+          broadcast_block_strides[broadcast_outer_dim] =
+              output_block_strides[outer_dim] * input_dims[outer_dim];
+          const Index offset = (first_multiple - outer_dim_left_index) *
+                               m_outputStrides[outer_dim];
+          BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+                         broadcast_block_strides, broadcast_tensor_strides,
+                         offset, output_block);
+        }
+        if (last_multiple < outer_dim_left_index + outer_dim_size) {
+          const Index tail_size =
+              outer_dim_left_index + outer_dim_size - last_multiple;
+          input_block_sizes[outer_dim] = tail_size;
+          broadcast_block_sizes[copy_outer_dim] = tail_size;
+          broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
+          broadcast_block_strides[copy_outer_dim] =
+              output_block_strides[outer_dim];
+          broadcast_block_sizes[broadcast_outer_dim] = 1;
+          broadcast_tensor_strides[broadcast_outer_dim] = 0;
+          broadcast_block_strides[broadcast_outer_dim] =
+              output_block_strides[outer_dim] * input_dims[outer_dim];
+          const Index offset = (last_multiple - outer_dim_left_index) *
+                               m_outputStrides[outer_dim];
+          BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+                         broadcast_block_strides, broadcast_tensor_strides,
+                         offset, output_block);
+        }
+      } else {
+        // b and c do not exist.
+        const int copy_outer_dim =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor)
+                ? 2 * outer_dim_start
+                : 2 * NumDims - 2 * outer_dim_start - 1;
+        input_block_sizes[outer_dim] = outer_dim_size;
+        broadcast_block_sizes[copy_outer_dim] = outer_dim_size;
+        broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim];
+        broadcast_block_strides[copy_outer_dim] =
+            output_block_strides[outer_dim];
+        BroadcastBlock(input_block_sizes, broadcast_block_sizes,
+                       broadcast_block_strides, broadcast_tensor_strides, 0,
+                       output_block);
+      }
+    }
+  }
+
   EIGEN_DEVICE_FUNC typename Eigen::internal::traits<XprType>::PointerType data() const { return NULL; }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
   Broadcast functor() const { return m_broadcast; }
 
+ private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock(
+      const Dimensions& input_block_sizes,
+      const BroadcastDimensions& broadcast_block_sizes,
+      const BroadcastDimensions& broadcast_block_strides,
+      const BroadcastDimensions& broadcast_tensor_strides, Index offset,
+      TensorBlock* output_block) const {
+    TensorBlock input_view_block(
+        static_cast<int>(Layout) == static_cast<int>(ColMajor)
+            ? indexColMajor(output_block->first_coeff_index() + offset)
+            : indexRowMajor(output_block->first_coeff_index() + offset),
+        input_block_sizes, Dimensions(m_inputStrides),
+        Dimensions(m_inputStrides), NULL);
+
+    internal::TensorBlockView<ArgType, Device> input_block(m_device, m_impl,
+                                                           input_view_block);
+    BroadcastTensorBlock broadcast_block(
+        0, broadcast_block_sizes, broadcast_block_strides,
+        broadcast_tensor_strides, output_block->data() + offset);
+
+    BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data());
+  }
+
 protected:
+  const Device& m_device;
   const Broadcast m_broadcast;
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
-- 
cgit v1.2.3
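
Illustrative sketch (editorial, not part of the patch above): the block() comments describe broadcasting with a doubled-rank copy, where every input dimension d_i gets a companion broadcast dimension b_i whose input stride is 0, so the same input element is re-read b_i times. The standalone C++ program below reproduces that idea for a 2x1 column-major input broadcast to 2x3. The helper name CopyWithStrides and the concrete sizes/strides are assumptions chosen for this example; Eigen's internal TensorBlock/TensorBlockReader machinery is deliberately not used.

    // Sketch of the 0-stride broadcasting trick from block() (assumed example,
    // not Eigen code). A 0 input stride re-reads the same input element.
    #include <cstdio>
    #include <vector>

    // Walks `rank` dimensions and copies input -> output. `out_strides`
    // address the output buffer, `in_strides` address the input buffer.
    void CopyWithStrides(int rank, const std::vector<int>& sizes,
                         const std::vector<int>& out_strides,
                         const std::vector<int>& in_strides, const float* input,
                         float* output) {
      std::vector<int> idx(rank, 0);
      for (;;) {
        int out = 0, in = 0;
        for (int d = 0; d < rank; ++d) {
          out += idx[d] * out_strides[d];
          in += idx[d] * in_strides[d];
        }
        output[out] = input[in];
        // Odometer-style increment, innermost dimension first.
        int d = 0;
        while (d < rank && ++idx[d] == sizes[d]) idx[d++] = 0;
        if (d == rank) break;
      }
    }

    int main() {
      // Input: 2x1 column-major tensor, broadcast factors {1, 3} -> 2x3 output.
      const float input[2] = {1.f, 2.f};
      float output[6] = {0};

      // 2 * rank dimensions [d_0, b_0, d_1, b_1], as in the patch comment.
      const std::vector<int> sizes = {2, 1, 1, 3};
      // Output strides [s_0, s_0 * d_0, s_1, s_1 * d_1] with s = {1, 2}.
      const std::vector<int> out_strides = {1, 2, 2, 2};
      // Input strides: copy dims use the real input strides, broadcast dims 0.
      const std::vector<int> in_strides = {1, 0, 2, 0};

      CopyWithStrides(4, sizes, out_strides, in_strides, input, output);
      for (int i = 0; i < 6; ++i) std::printf("%.0f ", output[i]);  // 1 2 1 2 1 2
      std::printf("\n");
      return 0;
    }

This is the same bookkeeping BroadcastTensorBlockReader::Run is given through broadcast_block_sizes, broadcast_block_strides, and broadcast_tensor_strides in the patch.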
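A second editorial sketch, under the same reading of the patch: the general case in block() splits the outer-dimension slice [a, a + outer_dim_size) at multiples of input_dims[outer_dim], giving a partial head, a middle made of whole periods (handled with the 0-stride trick), and a partial tail. The snippet below only computes b and c the same way and prints the three ranges for one concrete choice of numbers; divup here is a local stand-in for Eigen's helper of the same name.

    // Assumed example showing how the outer-dim slice is split at multiples of
    // the input dimension size (head / middle / tail), as in block().
    #include <cstdio>

    long divup(long x, long y) { return (x + y - 1) / y; }

    int main() {
      const long a = 5;                     // outer_dim_left_index
      const long outer_dim_size = 9;        // size of the output block slice
      const long input_outer_dim_size = 4;  // input_dims[outer_dim]

      // b: smallest multiple of input_outer_dim_size in [a, a + outer_dim_size].
      const long b = divup(a, input_outer_dim_size) * input_outer_dim_size;
      // c: largest multiple of input_outer_dim_size in [a, a + outer_dim_size].
      const long c =
          (a + outer_dim_size) / input_outer_dim_size * input_outer_dim_size;

      if (b <= a + outer_dim_size) {
        // Head copies a partial period, middle copies whole periods with the
        // 0-stride trick, tail copies the remainder.
        std::printf("head  [%ld, %ld)\n", a, b);                   // head  [5, 8)
        std::printf("middle[%ld, %ld)\n", b, c);                   // middle[8, 12)
        std::printf("tail  [%ld, %ld)\n", c, a + outer_dim_size);  // tail  [12, 14)
      } else {
        std::printf("no multiple in range: process [%ld, %ld) as one block\n", a,
                    a + outer_dim_size);
      }
      return 0;
    }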