From 83c0a16baf5ecac6288cd9b74536a82de8985b31 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 31 Jul 2018 15:56:31 -0700 Subject: Add block evaluation support to TensorOps --- unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 14 +- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 333 +++++++++++++++- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 144 ++++++- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 218 +++++++++- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 329 ++++++++++++++-- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 437 ++++++++++++++++++++- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 237 +++++++++-- 7 files changed, 1602 insertions(+), 110 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 84cf6d216..33c4ef5b7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -214,7 +214,7 @@ class TensorBlockIO { num_size_one_inner_dims, NumDims - num_size_one_inner_dims - 1); const StorageIndex block_dim_for_tensor_stride1_dim = NumDims == 0 ? 1 : tensor_to_block_dim_map[tensor_stride1_dim]; - size_t block_inner_dim_size = + Index block_inner_dim_size = NumDims == 0 ? 1 : block.block_sizes()[block_dim_for_tensor_stride1_dim]; for (int i = num_size_one_inner_dims + 1; i < NumDims; ++i) { @@ -745,16 +745,15 @@ class TensorBlockMapper { if (block_shape == TensorBlockShapeType::kUniformAllDims) { // Tensor will not fit within 'min_target_size' budget: calculate tensor // block dimension sizes based on "square" dimension size target. - const size_t dim_size_target = static_cast( + const Index dim_size_target = static_cast( std::pow(static_cast(min_target_size), 1.0 / static_cast(block_dim_sizes.rank()))); - for (size_t i = 0; i < block_dim_sizes.rank(); ++i) { + for (Index i = 0; i < block_dim_sizes.rank(); ++i) { // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it // a multiple of the packet size. Note that reducing // 'block_dim_size' in this manner can increase the number of // blocks, and so will amplify any per-block overhead. - block_dim_sizes[i] = numext::mini( - dim_size_target, static_cast(tensor_dims[i])); + block_dim_sizes[i] = numext::mini(dim_size_target, tensor_dims[i]); } // Add any un-allocated coefficients to inner dimension(s). 
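// Illustrative sketch (not part of this patch, names are made up): the
// kUniformAllDims branch above aims for roughly pow(min_target_size, 1/rank)
// coefficients per dimension and clamps each block dimension to the real
// tensor extent. A minimal standalone version of that sizing rule:
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<std::int64_t> UniformBlockDims(
    const std::vector<std::int64_t>& tensor_dims,
    std::int64_t min_target_size) {
  std::vector<std::int64_t> block_dims(tensor_dims.size(), 1);
  if (tensor_dims.empty()) return block_dims;
  const std::int64_t dim_size_target = static_cast<std::int64_t>(std::pow(
      static_cast<double>(min_target_size),
      1.0 / static_cast<double>(tensor_dims.size())));
  for (std::size_t i = 0; i < tensor_dims.size(); ++i) {
    // Never exceed the actual extent of dimension i.
    block_dims[i] = std::min(dim_size_target, tensor_dims[i]);
  }
  return block_dims;  // Leftover budget is then folded back into inner dims.
}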
StorageIndex total_size = block_dim_sizes.TotalSize(); @@ -789,9 +788,8 @@ class TensorBlockMapper { } } - eigen_assert( - block_dim_sizes.TotalSize() >= - numext::mini(min_target_size, tensor_dims.TotalSize())); + eigen_assert(block_dim_sizes.TotalSize() >= + numext::mini(min_target_size, tensor_dims.TotalSize())); return block_dim_sizes; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 343ab6269..a851e7f55 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -108,16 +108,29 @@ struct TensorEvaluator, Device> bool isCopy= false, nByOne = false, oneByN = false; enum { - IsAligned = true, + IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_broadcast(op.broadcast()),m_impl(op.expression(), device) - { + using ScalarNoConst = typename internal::remove_const::type; + + // Block based access to the XprType (input) tensor. + using TensorBlock = internal::TensorBlock; + using TensorBlockReader = internal::TensorBlockReader; + // We do block based broadcasting using a a trick with 2x tensor rank and 0 + // strides. See block method implementation for details. + using BroadcastDimensions = DSizes; + using BroadcastTensorBlock = internal::TensorBlock; + using BroadcastTensorBlockReader = internal::TensorBlockReader; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_device(device), + m_broadcast(op.broadcast()), + m_impl(op.expression(), device) { // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. @@ -216,8 +229,7 @@ struct TensorEvaluator, Device> } // TODO: attempt to speed this up. 
The integer divisions and modulo are slow - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const - { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { Index inputIndex = 0; for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; @@ -243,11 +255,15 @@ struct TensorEvaluator, Device> inputIndex += (index % m_impl.dimensions()[0]); } } - return m_impl.coeff(inputIndex); + return inputIndex; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const { + return m_impl.coeff(indexColMajor(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { Index inputIndex = 0; for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; @@ -263,17 +279,22 @@ struct TensorEvaluator, Device> } index -= idx * m_outputStrides[i]; } - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims-1]); + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims - 1]); inputIndex += index; } else { - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); } else { - inputIndex += (index % m_impl.dimensions()[NumDims-1]); + inputIndex += (index % m_impl.dimensions()[NumDims - 1]); } } - return m_impl.coeff(inputIndex); + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + return m_impl.coeff(indexRowMajor(index)); } template @@ -553,13 +574,291 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large + // tensors. But this might need further tuning. + Index l1_cache_scalars = m_device.firstLevelCacheSize() / sizeof(Scalar); + Index block_total_size_max = numext::maxi(Index(1), l1_cache_scalars); + + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + block_total_size_max)); + + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (NumDims <= 0) { + output_block->data()[0] = m_impl.coeff(0); + return; + } + + // Because we only support kSkewedInnerDims blocking, block size should be + // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size + // for the first outer dim, and 1 for other outer dims. This is guaranteed + // by MergeResourceRequirements() in TensorBlock.h. + const auto& output_block_sizes = output_block->block_sizes(); + const auto& output_block_strides = output_block->block_strides(); + + // Find where outer dims start. + int outer_dim_start = 0; + Index outer_dim_size = 1, inner_dim_size = 1; + for (int i = 0; i < NumDims; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? 
i + : NumDims - i - 1; + if (i > outer_dim_start) { + eigen_assert(output_block_sizes[dim] == 1); + } else if (output_block_sizes[dim] != m_dimensions[dim]) { + eigen_assert(output_block_sizes[dim] < m_dimensions[dim]); + outer_dim_size = output_block_sizes[dim]; + } else { + inner_dim_size *= output_block_sizes[dim]; + ++outer_dim_start; + } + } + + if (inner_dim_size == 0 || outer_dim_size == 0) { + return; + } + + const auto& input_dims = m_impl.dimensions(); + + // Pre-fill input_block_sizes, broadcast_block_sizes, + // broadcast_block_strides, and broadcast_tensor_strides. Later on we will + // only modify the outer_dim_start-th dimension on these arrays. + + // Calculate the input block size for looking into the input. + Dimensions input_block_sizes; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < outer_dim_start; ++i) { + input_block_sizes[i] = input_dims[i]; + } + for (int i = outer_dim_start; i < NumDims; ++i) { + input_block_sizes[i] = 1; + } + } else { + for (int i = 0; i < outer_dim_start; ++i) { + input_block_sizes[NumDims - i - 1] = input_dims[NumDims - i - 1]; + } + for (int i = outer_dim_start; i < NumDims; ++i) { + input_block_sizes[NumDims - i - 1] = 1; + } + } + + // Broadcast with the 0-stride trick: Create 1 extra dim for each + // broadcast, set the input stride to 0. + // + // When ColMajor: + // - broadcast_block_sizes is [d_0, b_0, d_1, b_1, ...]. + // + // - broadcast_block_strides is [output_block_strides[0], + // output_block_strides[0] * d_0, + // output_block_strides[1], + // output_block_strides[1] * d_1, + // ...]. + // + // - broadcast_tensor_strides is [output_block_strides[0], + // 0, + // output_block_strides[1], + // 0, + // ...]. + BroadcastDimensions broadcast_block_sizes, broadcast_block_strides, + broadcast_tensor_strides; + + for (int i = 0; i < outer_dim_start; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + : NumDims - i - 1; + const int copy_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 2 * i + : 2 * NumDims - 2 * i - 1; + const int broadcast_dim = + static_cast(Layout) == static_cast(ColMajor) ? copy_dim + 1 + : copy_dim - 1; + broadcast_block_sizes[copy_dim] = input_dims[dim]; + broadcast_block_sizes[broadcast_dim] = m_broadcast[dim]; + broadcast_block_strides[copy_dim] = output_block_strides[dim]; + broadcast_block_strides[broadcast_dim] = + output_block_strides[dim] * input_dims[dim]; + broadcast_tensor_strides[copy_dim] = m_inputStrides[dim]; + broadcast_tensor_strides[broadcast_dim] = 0; + } + for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + : 2 * NumDims - i - 1; + broadcast_block_sizes[dim] = 1; + broadcast_block_strides[dim] = 0; + broadcast_tensor_strides[dim] = 0; + } + + const int outer_dim = static_cast(Layout) == static_cast(ColMajor) + ? outer_dim_start + : NumDims - outer_dim_start - 1; + + if (outer_dim_size == 1) { + // We just need one block read using the ready-set values above. + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, 0, + output_block); + } else if (input_dims[outer_dim] == 1) { + // Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size. + const int broadcast_outer_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 
2 * outer_dim_start + 1 + : 2 * NumDims - 2 * outer_dim_start - 2; + broadcast_block_sizes[broadcast_outer_dim] = outer_dim_size; + broadcast_tensor_strides[broadcast_outer_dim] = 0; + broadcast_block_strides[broadcast_outer_dim] = + output_block_strides[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, 0, + output_block); + } else { + // The general case. Let's denote the output block as x[..., + // a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on + // the outer_dim_start-th dimension (< NumDims). We need to split the + // a:a+outer_dim_size into possibly 3 sub-blocks: + // + // (1) a:b, where b is the smallest multiple of + // input_dims[outer_dim_start] in [a, a+outer_dim_size]. + // + // (2) b:c, where c is the largest multiple of input_dims[outer_dim_start] + // in [a, a+outer_dim_size]. + // + // (3) c:a+outer_dim_size . + // + // Or, when b and c do not exist, we just need to process the whole block + // together. + + // Find a. + const Index outer_dim_left_index = + output_block->first_coeff_index() / m_outputStrides[outer_dim]; + + // Find b and c. + const Index input_outer_dim_size = input_dims[outer_dim]; + + // First multiple after a. This is b when <= outer_dim_left_index + + // outer_dim_size. + const Index first_multiple = + divup(outer_dim_left_index, input_outer_dim_size) * + input_outer_dim_size; + + if (first_multiple <= outer_dim_left_index + outer_dim_size) { + // b exists, so does c. Find it. + const Index last_multiple = (outer_dim_left_index + outer_dim_size) / + input_outer_dim_size * input_outer_dim_size; + const int copy_outer_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 2 * outer_dim_start + : 2 * NumDims - 2 * outer_dim_start - 1; + const int broadcast_outer_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 
2 * outer_dim_start + 1 + : 2 * NumDims - 2 * outer_dim_start - 2; + if (first_multiple > outer_dim_left_index) { + const Index head_size = first_multiple - outer_dim_left_index; + input_block_sizes[outer_dim] = head_size; + broadcast_block_sizes[copy_outer_dim] = head_size; + broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; + broadcast_block_strides[copy_outer_dim] = + output_block_strides[outer_dim]; + broadcast_block_sizes[broadcast_outer_dim] = 1; + broadcast_tensor_strides[broadcast_outer_dim] = 0; + broadcast_block_strides[broadcast_outer_dim] = + output_block_strides[outer_dim] * input_dims[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, 0, + output_block); + } + if (first_multiple < last_multiple) { + input_block_sizes[outer_dim] = input_outer_dim_size; + broadcast_block_sizes[copy_outer_dim] = input_outer_dim_size; + broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; + broadcast_block_strides[copy_outer_dim] = + output_block_strides[outer_dim]; + broadcast_block_sizes[broadcast_outer_dim] = + (last_multiple - first_multiple) / input_outer_dim_size; + broadcast_tensor_strides[broadcast_outer_dim] = 0; + broadcast_block_strides[broadcast_outer_dim] = + output_block_strides[outer_dim] * input_dims[outer_dim]; + const Index offset = (first_multiple - outer_dim_left_index) * + m_outputStrides[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, + offset, output_block); + } + if (last_multiple < outer_dim_left_index + outer_dim_size) { + const Index tail_size = + outer_dim_left_index + outer_dim_size - last_multiple; + input_block_sizes[outer_dim] = tail_size; + broadcast_block_sizes[copy_outer_dim] = tail_size; + broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; + broadcast_block_strides[copy_outer_dim] = + output_block_strides[outer_dim]; + broadcast_block_sizes[broadcast_outer_dim] = 1; + broadcast_tensor_strides[broadcast_outer_dim] = 0; + broadcast_block_strides[broadcast_outer_dim] = + output_block_strides[outer_dim] * input_dims[outer_dim]; + const Index offset = (last_multiple - outer_dim_left_index) * + m_outputStrides[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, + offset, output_block); + } + } else { + // b and c do not exist. + const int copy_outer_dim = + static_cast(Layout) == static_cast(ColMajor) + ? 
2 * outer_dim_start + : 2 * NumDims - 2 * outer_dim_start - 1; + input_block_sizes[outer_dim] = outer_dim_size; + broadcast_block_sizes[copy_outer_dim] = outer_dim_size; + broadcast_tensor_strides[copy_outer_dim] = m_inputStrides[outer_dim]; + broadcast_block_strides[copy_outer_dim] = + output_block_strides[outer_dim]; + BroadcastBlock(input_block_sizes, broadcast_block_sizes, + broadcast_block_strides, broadcast_tensor_strides, 0, + output_block); + } + } + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } Broadcast functor() const { return m_broadcast; } + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlock( + const Dimensions& input_block_sizes, + const BroadcastDimensions& broadcast_block_sizes, + const BroadcastDimensions& broadcast_block_strides, + const BroadcastDimensions& broadcast_tensor_strides, Index offset, + TensorBlock* output_block) const { + TensorBlock input_view_block( + static_cast(Layout) == static_cast(ColMajor) + ? indexColMajor(output_block->first_coeff_index() + offset) + : indexRowMajor(output_block->first_coeff_index() + offset), + input_block_sizes, Dimensions(m_inputStrides), + Dimensions(m_inputStrides), NULL); + + internal::TensorBlockView input_block(m_device, m_impl, + input_view_block); + BroadcastTensorBlock broadcast_block( + 0, broadcast_block_sizes, broadcast_block_strides, + broadcast_tensor_strides, output_block->data() + offset); + + BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data()); + } + protected: + const Device& m_device; const Broadcast m_broadcast; Dimensions m_dimensions; array m_outputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 085c05f3d..a0d039e64 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -144,14 +144,19 @@ struct TensorEvaluator, Device> enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. 
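// Illustrative sketch (standalone, not Eigen code): the block-based
// broadcasting implemented above doubles the rank and pairs each input
// dimension d_i with a broadcast dimension b_i whose *input* stride is 0, so
// a plain strided copy repeats every input element b_i times with no modulo
// arithmetic. A 1-D example of the same idea:
#include <cstdio>

int main() {
  const int d0 = 3;                        // input extent
  const int b0 = 2;                        // broadcast factor
  const float input[d0] = {1.f, 2.f, 3.f};
  float output[d0 * b0];

  // Two loops act as the two "dimensions" [d0, b0]: the copy dimension uses
  // the real input stride (1), the broadcast dimension uses input stride 0.
  const int input_strides[2] = {1, 0};
  const int output_strides[2] = {1, d0};
  for (int j = 0; j < b0; ++j)             // broadcast dimension
    for (int i = 0; i < d0; ++i)           // copy dimension
      output[i * output_strides[0] + j * output_strides[1]] =
          input[i * input_strides[0] + j * input_strides[1]];

  for (int k = 0; k < d0 * b0; ++k) std::printf("%g ", output[k]);  // 1 2 3 1 2 3
  std::printf("\n");
  return 0;
}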
- IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using InputTensorBlock = internal::TensorBlock; + using OutputTensorBlock = internal::TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) { @@ -184,6 +189,23 @@ struct TensorEvaluator, Device> } m_inputStride *= input_dims[m_dim.actualDim()]; m_inputOffset = m_stride * op.offset(); + + if (BlockAccess) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -266,6 +288,60 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + // TODO(andydavis) Reduce the overhead of this function (experiment with + // using a fixed block size). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + // Calculate input block sizes. + const DSizes& output_block_sizes = + output_block->block_sizes(); + const DSizes& output_block_strides = + output_block->block_strides(); + const Index chip_dim = m_dim.actualDim(); + DSizes input_block_sizes; + DSizes input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast(Layout) == static_cast(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1]; + } + } + // Instantiate and read input block from input tensor. 
+ InputTensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, input_block_strides, + m_inputStrides, output_block->data()); + m_impl.block(&input_block); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { CoeffReturnType* result = const_cast(m_impl.data()); if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || @@ -316,6 +392,8 @@ struct TensorEvaluator, Device> Index m_stride; Index m_inputOffset; Index m_inputStride; + Index m_block_total_size_max; + DSizes m_inputStrides; TensorEvaluator m_impl; const internal::DimensionId m_dim; const Device& m_device; @@ -342,12 +420,18 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using InputTensorBlock = internal::TensorBlock; + using OutputTensorBlock = internal::TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -395,6 +479,50 @@ struct TensorEvaluator, Device> } } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const OutputTensorBlock& output_block) { + // Calculate input block sizes. + const DSizes& output_block_sizes = + output_block.block_sizes(); + const DSizes& output_block_strides = + output_block.block_strides(); + const Index chip_dim = this->m_dim.actualDim(); + DSizes input_block_sizes; + DSizes input_block_strides; + for (Index i = 0; i < NumInputDims; ++i) { + if (i < chip_dim) { + input_block_sizes[i] = output_block_sizes[i]; + input_block_strides[i] = output_block_strides[i]; + } else if (i > chip_dim) { + input_block_sizes[i] = output_block_sizes[i - 1]; + input_block_strides[i] = output_block_strides[i - 1]; + } else { + input_block_sizes[i] = 1; + } + } + // Fix up input_block_stride for chip dimension. + if (static_cast(Layout) == static_cast(ColMajor)) { + if (chip_dim == 0) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim - 1] * input_block_sizes[chip_dim - 1]; + } + } else { + if (chip_dim == NumInputDims - 1) { + input_block_strides[chip_dim] = 1; + } else { + input_block_strides[chip_dim] = + input_block_strides[chip_dim + 1] * input_block_sizes[chip_dim + 1]; + } + } + // Write input block. 
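+    // E.g. (illustrative shapes): chipping dim 1 of a ColMajor 4x5x6 input
+    // gives a 4x6 output; an output block of sizes [4, 3] then maps to an
+    // input block of sizes [4, 1, 3], with the chip dimension's stride
+    // recomputed from its ColMajor neighbour as 1 * 4 = 4.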
+ this->m_impl.writeBlock(InputTensorBlock( + this->srcCoeff(output_block.first_coeff_index()), input_block_sizes, + input_block_strides, this->m_inputStrides, + const_cast(output_block.data()))); + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 72cb2d15f..4987b898b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -54,6 +54,66 @@ struct nested, 1, typename eval type; }; +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + const Index packet_size = internal::unpacket_traits::size; + const Index vectorized_size = + (num_coeff_to_copy / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = impl.template packet(src_index + i); + internal::pstoret(dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchPaddingOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Index num_coeff_to_pad, const Scalar padding_value, + const Index dst_index, Scalar* dst_data) { + const Index packet_size = internal::unpacket_traits::size; + const Packet padded_packet = internal::pset1(padding_value); + const Index vectorized_size = + (num_coeff_to_pad / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + internal::pstoret(dst_data + dst_index + i, + padded_packet); + } + for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { + dst_data[dst_index + i] = padding_value; + } + } +}; + } // end namespace internal template @@ -184,15 +244,17 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + BlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; - #ifdef __SYCL_DEVICE_ONLY__ + using OutputTensorBlock = internal::TensorBlock; + +#ifdef __SYCL_DEVICE_ONLY__ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType op, const Device& device) #else EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) @@ -342,6 +404,9 @@ struct TensorEvaluator, Device> } else { m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); } + 
+ m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -484,6 +549,146 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + using ImagePatchCopyOp = internal::ImagePatchCopyOp; + using ImagePatchPaddingOp = internal::ImagePatchPaddingOp; + + // Calculate loop limits and various input/output dim sizes. + const DSizes& block_sizes = output_block->block_sizes(); + const bool col_major = + static_cast(Layout) == static_cast(ColMajor); + const Index depth_dim_size = block_sizes[col_major ? 0 : NumDims - 1]; + const Index output_depth_dim_size = + m_dimensions[col_major ? 0 : NumDims - 1]; + const Index row_dim_size = block_sizes[col_major ? 1 : NumDims - 2]; + const Index output_row_dim_size = m_dimensions[col_major ? 1 : NumDims - 2]; + const Index col_dim_size = block_sizes[col_major ? 2 : NumDims - 3]; + const Index block_col_stride = row_dim_size * depth_dim_size; + const Index patch_index_dim_size = block_sizes[col_major ? 3 : NumDims - 4]; + const Index outer_dim_size = + block_sizes.TotalSize() / + (depth_dim_size * row_dim_size * col_dim_size * patch_index_dim_size); + + const Index patch_size = row_dim_size * col_dim_size * depth_dim_size; + const Index batch_size = patch_size * patch_index_dim_size; + + Index output_index = output_block->first_coeff_index(); + + // Loop through outer dimensions. + for (Index outer_dim_index = 0; outer_dim_index < outer_dim_size; + ++outer_dim_index) { + const Index outer_output_base_index = outer_dim_index * batch_size; + // Find the offset of the element wrt the location of the first element. + const Index patchIndexStart = output_index / m_fastPatchStride; + const Index patchOffset = + (output_index - patchIndexStart * m_patchStride) / m_fastOutputDepth; + const Index colOffsetStart = patchOffset / m_fastColStride; + // Other ways to index this element. + const Index otherIndex = + (NumDims == 4) ? 0 : output_index / m_fastOtherStride; + const Index patch2DIndexStart = + (NumDims == 4) + ? 0 + : (output_index - otherIndex * m_otherStride) / m_fastPatchStride; + // Calculate starting depth index. + const Index depth = output_index - (output_index / m_fastOutputDepth) * + output_depth_dim_size; + const Index patch_input_base_index = + depth + otherIndex * m_patchInputStride; + + // Loop through patches. + for (Index patch_index_dim_index = 0; + patch_index_dim_index < patch_index_dim_size; + ++patch_index_dim_index) { + const Index patch_output_base_index = + outer_output_base_index + patch_index_dim_index * patch_size; + // Patch index corresponding to the passed in index. + const Index patchIndex = patchIndexStart + patch_index_dim_index; + const Index patch2DIndex = + (NumDims == 4) ? patchIndex + : patch2DIndexStart + patch_index_dim_index; + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index input_col_base = colIndex * m_col_strides; + const Index row_offset_base = + (patch2DIndex - colIndex * m_outputRows) * m_row_strides - + m_rowPaddingTop; + + // Loop through columns. 
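+        // Each patch column maps back to an input column through the patch
+        // offset and the column strides, minus the left padding; columns that
+        // fall outside the effective input width, or between inflation
+        // strides, are filled with the padding value instead of being copied.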
+ for (Index col_dim_index = 0; col_dim_index < col_dim_size; + ++col_dim_index) { + const Index col_output_base_index = + patch_output_base_index + col_dim_index * block_col_stride; + + // Calculate col index in the input original tensor. + Index colOffset = colOffsetStart + col_dim_index; + Index inputCol = + input_col_base + colOffset * m_in_col_strides - m_colPaddingLeft; + Index origInputCol = + (m_col_inflate_strides == 1) + ? inputCol + : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0); + + bool pad_column = false; + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && + (inputCol != origInputCol * m_col_inflate_strides))) { + pad_column = true; + } + + const Index col_input_base_index = + patch_input_base_index + origInputCol * m_colInputStride; + const Index input_row_base = + row_offset_base + + ((patchOffset + col_dim_index * output_row_dim_size) - + colOffset * m_colStride) * + m_in_row_strides; + // Loop through rows. + for (Index row_dim_index = 0; row_dim_index < row_dim_size; + ++row_dim_index) { + const Index output_base_index = + col_output_base_index + row_dim_index * depth_dim_size; + bool pad_row = false; + Index inputIndex; + if (!pad_column) { + Index inputRow = + input_row_base + row_dim_index * m_in_row_strides; + Index origInputRow = + (m_row_inflate_strides == 1) + ? inputRow + : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) + : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && + (inputRow != origInputRow * m_row_inflate_strides))) { + pad_row = true; + } else { + inputIndex = + col_input_base_index + origInputRow * m_rowInputStride; + } + } + // Copy (or pad) along depth dimension. + if (pad_column || pad_row) { + ImagePatchPaddingOp::Run(depth_dim_size, Scalar(m_paddingValue), + output_base_index, output_block->data()); + } else { + ImagePatchCopyOp::Run(*this, depth_dim_size, output_base_index, + output_block->data(), inputIndex); + } + } + } + } + output_index += m_otherStride; + } + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { @@ -538,6 +743,7 @@ struct TensorEvaluator, Device> internal::TensorIntDivisor m_fastOutputDepth; Scalar m_paddingValue; + Index m_block_total_size_max; TensorEvaluator m_impl; #ifdef EIGEN_USE_SYCL diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 498488649..3747bff9e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -102,27 +102,64 @@ struct TensorEvaluator, Device> typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + static const int NumOutputDims = internal::array_size::value; + static const int NumInputDims = internal::array_size::Dimensions>::value; + enum { - IsAligned = TensorEvaluator::IsAligned, + IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + // TODO(andydavis, wuke) Enable BlockAccess for the general case when the + // performance issue with block-based reshape is resolved. 
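+    // Note that when the argument exposes RawAccess, block() below skips the
+    // coordinate remapping entirely and reads the output block directly from
+    // m_impl.data().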
+ BlockAccess = TensorEvaluator::BlockAccess && + TensorEvaluator::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; + using ScalarNoConst = typename internal::remove_const::type; + + using InputTensorBlock = internal::TensorBlock; + using OutputTensorBlock = internal::TensorBlock; + using OutputTensorBlockReader = internal::TensorBlockReader; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { // The total size of the reshaped tensor must be equal to the total size // of the input tensor. eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); - } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; + if (BlockAccess) { + const typename TensorEvaluator::Dimensions& input_dims = + m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + m_inputStrides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1]; + } + } else { + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + m_inputStrides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1]; + } + } + } + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -148,6 +185,140 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + m_impl.getResourceRequirements(resources); + } + + // TODO(andydavis) Reduce the overhead of this function. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + OutputTensorBlock* output_block) const { + if (m_impl.data() != NULL) { + OutputTensorBlockReader::Run(output_block, m_impl.data()); + return; + } + + // Calculate output block unit-stride inner dimension length. + const DSizes& output_block_sizes = + output_block->block_sizes(); + Index output_inner_dim_size = 1; + Index output_outer_dim_start = NumOutputDims; + for (Index i = 0; i < NumOutputDims; ++i) { + const Index dim = static_cast(Layout) == static_cast(ColMajor) + ? i : NumOutputDims - i - 1; + output_inner_dim_size *= output_block_sizes[dim]; + if (output_block_sizes[dim] < m_dimensions[dim]) { + output_outer_dim_start = i + 1; + break; + } + } + + // Initialize output block iterator state. + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + array block_iter_state; + + for (Index i = 0; i < NumOutputDims; ++i) { + const Index dim = static_cast(Layout) == static_cast(ColMajor) + ? 
i : NumOutputDims - i - 1; + block_iter_state[i].size = output_block_sizes[dim]; + block_iter_state[i].stride = m_outputStrides[dim]; + block_iter_state[i].span = + block_iter_state[i].stride * (block_iter_state[i].size - 1); + block_iter_state[i].count = 0; + } + + const Index output_outer_dim_size = output_block_sizes.TotalSize() / + output_inner_dim_size; + const typename TensorEvaluator::Dimensions& input_dims = + m_impl.dimensions(); + + Index index = output_block->first_coeff_index(); + for (Index outer_idx = 0; outer_idx < output_outer_dim_size; ++outer_idx) { + Index inner_idx = 0; + while (inner_idx < output_inner_dim_size) { + // Calculate input coords based on 'index'. + array input_coords; + Index idx = index; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumInputDims - 1; i > 0; --i) { + input_coords[i] = idx / m_inputStrides[i]; + idx -= input_coords[i] * m_inputStrides[i]; + } + input_coords[0] = idx; + } else { + for (int i = 0; i < NumInputDims - 1; ++i) { + input_coords[i] = idx / m_inputStrides[i]; + idx -= input_coords[i] * m_inputStrides[i]; + } + input_coords[NumInputDims - 1] = idx; + } + + // Calculate target input block shape, using at most + // 'output_inner_dim_size' coefficients along the input block's inner + // dimensions. + DSizes input_block_sizes; + Index num_to_allocate = output_inner_dim_size - inner_idx; + for (Index i = 0; i < NumInputDims; ++i) { + const Index dim = + static_cast(Layout) == static_cast(ColMajor) + ? i : NumInputDims - i - 1; + input_block_sizes[dim] = numext::mini( + num_to_allocate, (static_cast(input_dims[dim]) - + input_coords[dim])); + if (input_coords[dim] == 0) { + num_to_allocate /= input_block_sizes[dim]; + } else { + num_to_allocate = 1; + } + } + + // Calculate input block strides. + DSizes input_block_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_block_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_block_strides[i] = input_block_strides[i - 1] * + input_block_sizes[i - 1]; + } + } else { + input_block_strides[NumInputDims - 1] = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_block_strides[i] = input_block_strides[i + 1] * + input_block_sizes[i + 1]; + } + } + + // Instantiate and read input block from input tensor. + InputTensorBlock input_block(index, input_block_sizes, + input_block_strides, m_inputStrides, + output_block->data() + outer_idx * + output_inner_dim_size + inner_idx); + + m_impl.block(&input_block); + + const Index input_block_total_size = input_block_sizes.TotalSize(); + index += input_block_total_size; + inner_idx += input_block_total_size; + } + eigen_assert(inner_idx == output_inner_dim_size); + index -= output_inner_dim_size; + // Update index. 
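+      // Advance the outer-dimension "odometer": bump the first outer dim that
+      // still has room by its output stride, resetting any exhausted dims by
+      // subtracting their span.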
+ for (Index i = output_outer_dim_start; i < NumOutputDims; ++i) { + if (++block_iter_state[i].count < block_iter_state[i].size) { + index += block_iter_state[i].stride; + break; + } + block_iter_state[i].count = 0; + index -= block_iter_state[i].span; + } + } + } + EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return const_cast(m_impl.data()); } EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } @@ -155,6 +326,8 @@ struct TensorEvaluator, Device> protected: TensorEvaluator m_impl; NewDimensions m_dimensions; + DSizes m_outputStrides; + DSizes m_inputStrides; }; @@ -322,17 +495,27 @@ struct TensorEvaluator, Devi typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets and sizes. - IsAligned = /*TensorEvaluator::IsAligned*/false, + IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using TensorBlock = internal::TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { @@ -340,6 +523,16 @@ struct TensorEvaluator, Devi eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); } + m_is_identity = true; + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= + op.sizes()[i] + op.startIndices()[i]); + if (m_impl.dimensions()[i] != op.sizes()[i] || + op.startIndices()[i] != 0) { + m_is_identity = false; + } + } + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Sizes& output_dims = op.sizes(); if (static_cast(Layout) == static_cast(ColMajor)) { @@ -367,13 +560,10 @@ struct TensorEvaluator, Devi m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } - } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Sizes Dimensions; + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -417,7 +607,11 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_impl.coeff(srcCoeff(index)); + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } } template @@ -427,6 +621,10 @@ struct TensorEvaluator, Devi EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); + if (m_is_identity) { + return m_impl.template packet(index); + } + Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize 
- 1}; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -469,9 +667,26 @@ struct TensorEvaluator, Devi } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + output_block->block_sizes(), + output_block->block_strides(), + Dimensions(m_inputStrides), + output_block->data()); + m_impl.block(&input_block); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Eigen::internal::traits::PointerType data() const { Scalar* result = m_impl.data(); @@ -544,7 +759,9 @@ struct TensorEvaluator, Devi TensorEvaluator m_impl; const Device& m_device; Dimensions m_dimensions; + bool m_is_identity; const StartIndices m_offsets; + Index m_block_total_size_max; }; @@ -557,33 +774,46 @@ struct TensorEvaluator, Device> typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, + IsAligned = /*TensorEvaluator::IsAligned*/false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess }; + using ScalarNoConst = typename internal::remove_const::type; + + using TensorBlock = internal::TensorBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Sizes Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - return this->m_impl.coeffRef(this->srcCoeff(index)); + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { + if (this->m_is_identity) { + this->m_impl.template writePacket(index, x); + return; + } + const int packetSize = internal::unpacket_traits::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; @@ -623,6 +853,14 @@ struct TensorEvaluator, Device> } } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + this->m_impl.writeBlock(TensorBlock( + this->srcCoeff(block.first_coeff_index()), block.block_sizes(), + block.block_strides(), Dimensions(this->m_inputStrides), + 
const_cast(block.data()))); + } }; @@ -739,7 +977,13 @@ struct TensorEvaluator startIndicesClamped, stopIndicesClamped; + m_is_identity = true; for (size_t i = 0; i < internal::array_size::value; ++i) { + if (m_strides[i] != 1 || op.startIndices()[i] != 0 || + op.stopIndices()[i] != (m_impl.dimensions()[i] - 1)) { + m_is_identity = false; + } + eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); if(m_strides[i]>0){ startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); @@ -822,11 +1066,15 @@ struct TensorEvaluator::PointerType data() const { @@ -873,6 +1121,7 @@ struct TensorEvaluator m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; + bool m_is_identity; TensorEvaluator m_impl; const Device& m_device; DSizes m_startIndices; // clamped startIndices @@ -916,7 +1165,11 @@ struct TensorEvaluatorm_impl.coeffRef(this->srcCoeff(index)); + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 375fc0802..05c0990dc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -356,6 +356,70 @@ template __global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + explicit BlockReducer(const Op& reducer) : op_(reducer) { + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + for (Index i = 0; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { return op_.finalize(accum_); } + PacketReturnType FinalizePacket() { + // TODO(andydavis) This function should not be called for Scalar + // reductions: clean this up or add an assert here. 
+ return PacketReturnType(); + } + + private: + CoeffReturnType accum_; + Op op_; +}; + +template +class BlockReducer { + public: + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::PacketReturnType PacketReturnType; + static const Index PacketSize = + internal::unpacket_traits::size; + + explicit BlockReducer(const Op& reducer) : op_(reducer) { + vaccum_ = op_.template initializePacket(); + accum_ = op_.initialize(); + } + void Reduce(Index index, Index num_values_to_reduce, Scalar* data) { + const Index vectorized_size = + (num_values_to_reduce / PacketSize) * PacketSize; + for (Index i = 0; i < vectorized_size; i += PacketSize) { + op_.reducePacket( + internal::ploadt(&data[index + i]), + &vaccum_); + } + for (Index i = vectorized_size; i < num_values_to_reduce; ++i) { + op_.reduce(data[index + i], &accum_); + } + } + CoeffReturnType Finalize() { return op_.finalizeBoth(accum_, vaccum_); } + PacketReturnType FinalizePacket() { return op_.finalizePacket(vaccum_); } + + private: + PacketReturnType vaccum_; + CoeffReturnType accum_; + Op op_; +}; + } // end namespace internal @@ -394,6 +458,7 @@ class TensorReductionOp : public TensorBase class MakePointer_, typename Device> struct TensorEvaluator, Device> { + typedef internal::reducer_traits ReducerTraits; typedef TensorReductionOp XprType; typedef typename XprType::Index Index; typedef ArgType ChildType; @@ -410,14 +475,19 @@ struct TensorEvaluator, static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using OutputTensorBlock = internal::TensorBlock; + using InputTensorBlock = internal::TensorBlock; + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; static const bool RunningFullReduction = (NumOutputDims==0); @@ -451,11 +521,13 @@ struct TensorEvaluator, m_outputStrides[0] = 1; for (int i = 1; i < NumOutputDims; ++i) { m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } else { - m_outputStrides.back() = 1; + m_outputStrides[NumOutputDims - 1] = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } } @@ -483,6 +555,7 @@ struct TensorEvaluator, ++reduceIndex; } else { m_preservedStrides[outputIndex] = input_strides[i]; + m_output_to_input_dim_map[outputIndex] = i; ++outputIndex; } } @@ -492,6 +565,16 @@ struct TensorEvaluator, if (NumOutputDims == 0) { m_preservedStrides[0] = internal::array_prod(input_dims); } + + m_numValuesToReduce = + NumOutputDims == 0 + ? internal::array_prod(input_dims) + : (static_cast(Layout) == static_cast(ColMajor)) + ? 
m_preservedStrides[0] + : m_preservedStrides[NumOutputDims - 1]; + + m_block_total_size_max = + numext::maxi(1, device.lastLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -686,6 +769,265 @@ struct TensorEvaluator, } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kSkewedInnerDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void block( + OutputTensorBlock* output_block) const { + // Special case full reductions to avoid input block copy below. + if (NumInputDims == NumReducedDims) { + eigen_assert(output_block->first_coeff_index() == 0); + eigen_assert(output_block->block_sizes().TotalSize() == 1); + Op reducer(m_reducer); + output_block->data()[0] = internal::InnerMostDimReducer::reduce( + *this, 0, m_numValuesToReduce, reducer); + return; + } + + // Calculate input tensor 'slice' required to reduce output block coeffs. + DSizes input_slice_sizes(m_impl.dimensions()); + for (int i = 0; i < NumOutputDims; ++i) { + // Clip preserved input dimensions by output block size. + input_slice_sizes[m_output_to_input_dim_map[i]] = + output_block->block_sizes()[i]; + } + + // Shard input tensor slice into blocks (because it could be large if we + // need to reduce along several dimensions to calculate required output + // coefficients). + const Index max_coeff_count = + numext::mini(((m_device.firstLevelCacheSize()) / sizeof(Scalar)), + input_slice_sizes.TotalSize()); + + // Calculate max output shard size needed to keep working set of reducers + // in L1, while leaving enough space for reducer overhead and 'PacketSize' + // reductions. + DSizes target_input_block_sizes; + CalculateTargetInputBlockShape(max_coeff_count, input_slice_sizes, + &target_input_block_sizes); + // Calculate indices for first preserved dimension. + const Index first_preserved_dim_output_index = + static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumOutputDims - 1; + const Index first_preserved_dim_input_index = + m_output_to_input_dim_map[first_preserved_dim_output_index]; + const bool inner_most_dim_preserved = + first_preserved_dim_input_index == + (static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumInputDims - 1) | + PreservingInnerMostDims; + + // Calculate output block inner/outer dimension sizes. + const Index output_block_inner_dim_size = + output_block->block_sizes()[first_preserved_dim_output_index]; + const Index output_block_outer_dim_size = + output_block->block_sizes().TotalSize() / output_block_inner_dim_size; + // Calculate shard size for first preserved dimension. + const Index output_shard_size = + target_input_block_sizes[first_preserved_dim_input_index]; + const Index num_output_shards = + (output_block_inner_dim_size + output_shard_size - 1) / + output_shard_size; + + // Initialize 'tensor_slice_offsets' from input coords of output index. + DSizes tensor_slice_offsets; + GetInputCoordsForOutputIndex(output_block->first_coeff_index(), + &tensor_slice_offsets); + + // Store tensor slice offset in first preserved dimension to be used + // to update tensor slice extents in loop below. 
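+    // Each output shard covers at most 'output_shard_size' contiguous outputs
+    // along the first preserved dimension, so the per-output reducer state and
+    // the input block together stay within the L1 budget computed above.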
+ const Index first_preserved_dim_offset_start = + tensor_slice_offsets[first_preserved_dim_input_index]; + + array block_iter_state; + + // Initialize state used to iterate through output coefficients + // and update 'tensor_slice_offsets' in outer preserved dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + 1 + : NumOutputDims - i - 2; + block_iter_state[i].input_dim = m_output_to_input_dim_map[dim]; + block_iter_state[i].output_size = output_block->block_sizes()[dim]; + block_iter_state[i].output_count = 0; + } + + // Allocate input block memory. + ScalarNoConst* input_block_data = static_cast( + m_device.allocate(max_coeff_count * sizeof(Scalar))); + // Allocate reducer memory. + const bool packet_reductions_enabled = + (Self::InputPacketAccess & Self::ReducerTraits::PacketAccess); + const Index num_reducers = + (inner_most_dim_preserved && packet_reductions_enabled) + ? (output_shard_size / PacketSize + output_shard_size % PacketSize + + PacketSize) + : output_shard_size; + typedef internal::BlockReducer BlockReducer; + BlockReducer* reducers = static_cast( + m_device.allocate(num_reducers * sizeof(BlockReducer))); + + InputDimensions input_tensor_dims(m_impl.dimensions()); + for (Index output_outer_index = 0; + output_outer_index < output_block_outer_dim_size; + ++output_outer_index) { + for (Index output_shard_index = 0; output_shard_index < num_output_shards; + ++output_shard_index) { + // Initialize 'tensor_slice_extents' for this output shard. + DSizes tensor_slice_extents(input_slice_sizes); + for (int i = 0; i < NumInputDims; ++i) { + if (i == first_preserved_dim_input_index) { + // Clip first preserved dim size to output shard size. + tensor_slice_extents[i] = numext::mini( + output_shard_size, + input_slice_sizes[i] - (tensor_slice_offsets[i] - + first_preserved_dim_offset_start)); + + } else if (!m_reduced[i]) { + // Clip outer preserved dims to size 1, so that we reduce a + // contiguous set of output coefficients. + tensor_slice_extents[i] = 1; + } + } + + // Intialize output coefficient reducers. + for (int i = 0; i < num_reducers; ++i) { + new (&reducers[i]) BlockReducer(m_reducer); + } + + using TensorSliceBlockMapper = + internal::TensorSliceBlockMapper; + + // TODO(andydavis) Consider removing 'input_block_stride_order' if we + // find that scattered reads are not worth supporting in + // TensorSliceBlockMapper. + TensorSliceBlockMapper block_mapper( + input_tensor_dims, tensor_slice_offsets, tensor_slice_extents, + target_input_block_sizes, DimensionList()); + + const Index num_outputs_to_update = + tensor_slice_extents[first_preserved_dim_input_index]; + const Index preserved_dim_vector_reducer_count = + (inner_most_dim_preserved && packet_reductions_enabled) + ? num_outputs_to_update / PacketSize + : 0; + const Index preserved_dim_vector_coeff_count = + inner_most_dim_preserved + ? preserved_dim_vector_reducer_count * PacketSize + : 0; + const Index preserved_dim_reducer_limit = + (inner_most_dim_preserved && packet_reductions_enabled) + ? (preserved_dim_vector_reducer_count + + num_outputs_to_update % PacketSize) + : num_outputs_to_update; + + const Index total_block_count = block_mapper.total_block_count(); + for (Index b = 0; b < total_block_count; ++b) { + InputTensorBlock input_block = + block_mapper.GetBlockForIndex(b, input_block_data); + // Read. 
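+          // Materialize the input block into the scratch buffer, then feed its
+          // coefficients to the per-output reducers (packet-wise when the
+          // inner-most preserved dimension is contiguous).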
+ m_impl.block(&input_block); + + Index num_values_to_reduce = 1; + for (Index i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + num_values_to_reduce *= input_block.block_sizes()[i]; + } + } + // Reduce. + if (inner_most_dim_preserved) { + const Index input_outer_dim_size = + input_block.block_sizes().TotalSize() / num_outputs_to_update; + for (Index input_outer_dim_index = 0; + input_outer_dim_index < input_outer_dim_size; + ++input_outer_dim_index) { + const Index input_outer_dim_base = + input_outer_dim_index * num_outputs_to_update; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + reducers[i].Reduce(input_outer_dim_base + i * PacketSize, + PacketSize, input_block.data()); + } + const Index scalar_reducer_base = + input_outer_dim_base + preserved_dim_vector_coeff_count; + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + reducers[i].Reduce(scalar_reducer_base + i - + preserved_dim_vector_reducer_count, + 1, input_block.data()); + } + } + } else { + for (Index i = 0; i < num_outputs_to_update; ++i) { + reducers[i].Reduce(i * num_values_to_reduce, num_values_to_reduce, + input_block.data()); + } + } + } + + // Finalize all reducers for this output shard. + const Index output_base_index = + output_outer_index * output_block_inner_dim_size + + output_shard_index * output_shard_size; + if (inner_most_dim_preserved) { + EIGEN_ALIGN_MAX + typename internal::remove_const::type + values[PacketSize]; + for (Index i = 0; i < preserved_dim_vector_reducer_count; ++i) { + const Index reducer_base = output_base_index + i * PacketSize; + internal::pstore( + values, reducers[i].FinalizePacket()); + for (Index j = 0; j < PacketSize; ++j) { + output_block->data()[reducer_base + j] = values[j]; + } + } + const Index scalar_reducer_base = + output_base_index + preserved_dim_vector_coeff_count; + + for (Index i = preserved_dim_vector_reducer_count; + i < preserved_dim_reducer_limit; ++i) { + output_block->data()[scalar_reducer_base + i - + preserved_dim_vector_reducer_count] = + reducers[i].Finalize(); + } + } else { + for (int i = 0; i < num_outputs_to_update; ++i) { + output_block->data()[output_base_index + i] = + reducers[i].Finalize(); + } + } + + // Update 'tensor_slice_offsets' by num outputs for this output shard. + tensor_slice_offsets[first_preserved_dim_input_index] += + num_outputs_to_update; + } + // Update slice offset for inner preserved dim. + tensor_slice_offsets[first_preserved_dim_input_index] -= + output_block_inner_dim_size; + // Update slice offsets for remaining output dims. + for (int i = 0; i < NumOutputDims - 1; ++i) { + BlockIteratorState& b = block_iter_state[i]; + if (++b.output_count < b.output_size) { + ++tensor_slice_offsets[b.input_dim]; + break; + } + b.output_count = 0; + tensor_slice_offsets[b.input_dim] -= b.output_size - 1; + } + } + + // Free memory. + m_device.deallocate(input_block_data); + m_device.deallocate(reducers); + } + EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } #if defined(EIGEN_USE_SYCL) @@ -722,6 +1064,12 @@ struct TensorEvaluator, template friend struct internal::InnerReducer; + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -764,16 +1112,90 @@ struct TensorEvaluator, return startInput; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void GetInputCoordsForOutputIndex( + Index index, + DSizes* coords) const { + for (int i = 0; i < NumInputDims; ++i) { + (*coords)[i] = 0; + } + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[0]] = index; + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + (*coords)[m_output_to_input_dim_map[i]] = idx; + index -= idx * m_outputStrides[i]; + } + (*coords)[m_output_to_input_dim_map[NumOutputDims-1]] = index; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void CalculateTargetInputBlockShape( + const Index max_coeff_count, + const DSizes& input_slice_sizes, + DSizes* target_input_block_sizes) const { + typedef typename internal::packet_traits::type Packet; + typedef internal::BlockReducer BlockReducer; + // TODO(andydavis) Compute reducer overhead correctly for the case where + // we are preserving the inner most dimension, and a single reducer + // reduces a packet's worth of output coefficients. + const Index reducer_overhead = sizeof(BlockReducer) / sizeof(Scalar); + + Index coeff_to_allocate = max_coeff_count; + bool first_preserved_dim_allocated = false; + bool first_reduced_dim_allocated = false; + for (int i = 0; i < NumInputDims; ++i) { + const int dim = static_cast(Layout) == static_cast(ColMajor) + ? i + : NumInputDims - i - 1; + (*target_input_block_sizes)[dim] = 1; + if (m_reduced[dim]) { + // TODO(andydavis) Consider allocating to multiple reduced dimensions. + // Watch out for cases where reduced dimensions are not contiguous, + // which induces scattered reads. + if (!first_reduced_dim_allocated) { + (*target_input_block_sizes)[dim] = + numext::mini(input_slice_sizes[dim], coeff_to_allocate); + coeff_to_allocate /= (*target_input_block_sizes)[dim]; + first_reduced_dim_allocated = true; + } + } else if (!first_preserved_dim_allocated) { + // TODO(andydavis) Include output block size in this L1 working set + // calculation. + const Index allocated = max_coeff_count - coeff_to_allocate; + const Index alloc_size = numext::maxi( + static_cast(1), coeff_to_allocate / reducer_overhead); + (*target_input_block_sizes)[dim] = + numext::mini(input_slice_sizes[dim], alloc_size); + coeff_to_allocate = numext::maxi( + static_cast(1), + coeff_to_allocate / + ((*target_input_block_sizes)[dim] * reducer_overhead)); + first_preserved_dim_allocated = true; + } + } + } + // Bitmap indicating if an input dimension is reduced or not. array m_reduced; // Dimensions of the output of the operation. Dimensions m_dimensions; // Precomputed strides for the output tensor. array m_outputStrides; + array, NumOutputDims> m_fastOutputStrides; // Subset of strides of the input tensor for the non-reduced dimensions. // Indexed by output dimensions. static const int NumPreservedStrides = max_n_1::size; array m_preservedStrides; + // Map from output to input dimension index. + array m_output_to_input_dim_map; + // How many values go into each reduction + Index m_numValuesToReduce; // Subset of strides of the input tensor for the reduced dimensions. // Indexed by reduced dimensions. 
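Note on the hunk above: GetInputCoordsForOutputIndex expands a flat output index into per-dimension input coordinates by repeatedly dividing by the precomputed output strides and routing each output dimension back to its input dimension through m_output_to_input_dim_map. The snippet below is a minimal, standalone sketch of that divmod walk for a column-major layout; the names (flat_to_coords, strides) are illustrative only, and the real evaluator additionally uses TensorIntDivisor for the fast division.

  // Standalone sketch (illustrative names, not part of this patch) of the
  // flat-index -> coordinates expansion done by GetInputCoordsForOutputIndex,
  // assuming column-major layout and plain integer division.
  #include <array>

  template <int NumDims>
  std::array<long, NumDims> flat_to_coords(long index,
                                           const std::array<long, NumDims>& strides) {
    // strides[i] is the product of the sizes of dimensions 0..i-1, so
    // strides[0] == 1 for the innermost (fastest varying) dimension.
    std::array<long, NumDims> coords{};
    for (int i = NumDims - 1; i > 0; --i) {
      const long idx = index / strides[i];  // coordinate along dimension i
      coords[i] = idx;
      index -= idx * strides[i];            // remainder indexes dims 0..i-1
    }
    coords[0] = index;                      // innermost coordinate
    return coords;
  }

In the evaluator the coordinate for output dimension i is stored at position m_output_to_input_dim_map[i], which is what seeds tensor_slice_offsets for the block reduction.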
@@ -782,6 +1204,9 @@ struct TensorEvaluator, // Indexed by reduced dimensions. array m_reducedDims; + // Block size for tiled (aka TensorBlock) evaluation. + Index m_block_total_size_max; + // Evaluator for the input expression. TensorEvaluator m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 6b54f40ad..3add9dac0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -100,6 +100,7 @@ class TensorShufflingOp : public TensorBase template struct TensorEvaluator, Device> { + typedef TensorEvaluator, Device> Self; typedef TensorShufflingOp XprType; typedef typename XprType::Index Index; static const int NumDims = internal::array_size::Dimensions>::value; @@ -110,44 +111,60 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using TensorBlock = internal::TensorBlock; + using TensorBlockReader = internal::TensorBlockReader; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation()) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); + m_is_identity = true; for (int i = 0; i < NumDims; ++i) { m_dimensions[i] = input_dims[shuffle[i]]; + m_inverseShuffle[shuffle[i]] = i; + if (m_is_identity && shuffle[i] != i) { + m_is_identity = false; + } } - array inputStrides; - if (static_cast(Layout) == static_cast(ColMajor)) { - inputStrides[0] = 1; + m_unshuffledInputStrides[0] = 1; m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { - inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1]; + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } else { - inputStrides[NumDims - 1] = 1; + m_unshuffledInputStrides[NumDims - 1] = 1; m_outputStrides[NumDims - 1] = 1; for (int i = NumDims - 2; i >= 0; --i) { - inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] = inputStrides[shuffle[i]]; + m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; } + + m_block_total_size_max = + numext::maxi(1, device.firstLevelCacheSize() / sizeof(Scalar)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -162,29 +179,151 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_impl.coeff(srcCoeff(index)); + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } } 
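Note on the shuffle evaluator changes above: the new block path relies on two pieces of state set up in the constructor, m_inverseShuffle (so block reads can map input dimensions back to output dimensions) and m_is_identity (so an identity permutation can bypass index remapping entirely). The srcCoeff remapping itself is a divmod over the output strides, accumulating the input stride of the shuffled dimension at each step. Below is a minimal standalone example of that arithmetic, assuming a column-major 3-D tensor and plain division; all names are illustrative.

  // Sketch of the output-index -> input-index remapping performed by srcCoeff
  // (assumptions: column-major layout, rank 3, no TensorIntDivisor).
  #include <array>
  #include <cstdio>

  int main() {
    const std::array<long, 3> in_dims = {2, 3, 4};
    const std::array<int, 3> shuffle = {2, 0, 1};  // output dim i <- input dim shuffle[i]

    // Unshuffled input strides and shuffled output dims/strides (column-major).
    const std::array<long, 3> in_strides = {1, in_dims[0], in_dims[0] * in_dims[1]};
    std::array<long, 3> out_dims;
    for (int i = 0; i < 3; ++i) out_dims[i] = in_dims[shuffle[i]];
    const std::array<long, 3> out_strides = {1, out_dims[0], out_dims[0] * out_dims[1]};

    // Input stride attached to each *output* dimension, as in m_inputStrides.
    std::array<long, 3> shuffled_in_strides;
    for (int i = 0; i < 3; ++i) shuffled_in_strides[i] = in_strides[shuffle[i]];

    // Remap one flat output index (the body of srcCoeff).
    long index = 17, input_index = 0;
    for (int i = 2; i > 0; --i) {
      const long idx = index / out_strides[i];
      input_index += idx * shuffled_in_strides[i];
      index -= idx * out_strides[i];
    }
    input_index += index * shuffled_in_strides[0];
    std::printf("input index = %ld\n", input_index);  // prints 10 for this example
    return 0;
  }

The fast m_fastOutputStrides divisors introduced by the patch replace the plain division in this sketch; the arithmetic is otherwise the same.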
+ template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static PacketReturnType Run(const Self& self, Index index) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + }; + + template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static PacketReturnType Run(const Self& self, Index index) { + if (self.m_is_identity) { + return self.m_impl.template packet(index); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + }; + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + return PacketLoader::PacketAccess>::Run(*this, index); + } - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector* resources) const { + resources->push_back(internal::TensorOpResourceRequirements( + internal::TensorBlockShapeType::kUniformAllDims, + m_block_total_size_max)); + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block( + TensorBlock* output_block) const { + if (m_impl.data() != NULL) { + // Fast path: we have direct access to the data, so shuffle as we read. + TensorBlockReader::Run(output_block, + srcCoeff(output_block->first_coeff_index()), + m_inverseShuffle, + m_unshuffledInputStrides, + m_impl.data()); + return; + } + + // Slow path: read unshuffled block from the input and shuffle in-place. + // Initialize input block sizes using input-to-output shuffle map. + DSizes input_block_sizes; + for (Index i = 0; i < NumDims; ++i) { + input_block_sizes[i] = output_block->block_sizes()[m_inverseShuffle[i]]; + } + + // Calculate input block strides. + DSizes input_block_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_block_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + input_block_strides[i] = + input_block_strides[i - 1] * input_block_sizes[i - 1]; + } + } else { + input_block_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + input_block_strides[i] = + input_block_strides[i + 1] * input_block_sizes[i + 1]; + } + } + + // Read input block. + TensorBlock input_block(srcCoeff(output_block->first_coeff_index()), + input_block_sizes, + input_block_strides, + Dimensions(m_unshuffledInputStrides), + output_block->data()); + + m_impl.block(&input_block); + + // Naive In-place shuffle: random IO but block size is O(L1 cache size). + // TODO(andydavis) Improve the performance of this in-place shuffle. 
+ const Index total_size = input_block_sizes.TotalSize(); + std::vector bitmap(total_size, false); + ScalarNoConst* data = const_cast(output_block->data()); + const DSizes& output_block_strides = + output_block->block_strides(); + for (Index input_index = 0; input_index < total_size; ++input_index) { + if (bitmap[input_index]) { + // Coefficient at this index has already been shuffled. + continue; + } + + Index output_index = GetBlockOutputIndex(input_index, input_block_strides, + output_block_strides); + if (output_index == input_index) { + // Coefficient already in place. + bitmap[output_index] = true; + continue; + } + + // The following loop starts at 'input_index', and shuffles + // coefficients into their shuffled location at 'output_index'. + // It skips through the array shuffling coefficients by following + // the shuffle cycle starting and ending a 'start_index'. + ScalarNoConst evicted_value; + ScalarNoConst shuffled_value = data[input_index]; + do { + evicted_value = data[output_index]; + data[output_index] = shuffled_value; + shuffled_value = evicted_value; + bitmap[output_index] = true; + output_index = GetBlockOutputIndex(output_index, input_block_strides, + output_block_strides); + } while (output_index != input_index); + + data[output_index] = shuffled_value; + bitmap[output_index] = true; } - PacketReturnType rslt = internal::pload(values); - return rslt; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + + const double compute_cost = m_is_identity ? TensorOpCost::AddCost() : + NumDims * (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize); } EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } @@ -195,27 +334,57 @@ struct TensorEvaluator, Device> EIGEN_STRONG_INLINE const TensorEvaluator& impl() const {return m_impl;} protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex( + Index input_index, + const DSizes& input_block_strides, + const DSizes& output_block_strides) const { + Index output_index = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = input_index / input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = input_index / input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[NumDims - 1]]; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; + const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } return inputIndex + index * m_inputStrides[0]; } else { for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; + 
const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } return inputIndex + index * m_inputStrides[NumDims - 1]; } } + Dimensions m_dimensions; + bool m_is_identity; + array m_inverseShuffle; array m_outputStrides; + array, NumDims> m_fastOutputStrides; array m_inputStrides; + array m_unshuffledInputStrides; + Index m_block_total_size_max; TensorEvaluator m_impl; /// required by sycl Shuffle m_shuffle; @@ -239,12 +408,18 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = false, PacketAccess = (internal::packet_traits::size > 1), - BlockAccess = false, - RawAccess = false + BlockAccess = TensorEvaluator::BlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; + using ScalarNoConst = typename internal::remove_const::type; + + using TensorBlock = internal::TensorBlock; + using TensorBlockWriter = internal::TensorBlockWriter; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -265,6 +440,14 @@ struct TensorEvaluator, Device> this->coeffRef(index+i) = values[i]; } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + TensorBlockWriter::Run(block, this->srcCoeff(block.first_coeff_index()), + this->m_inverseShuffle, + this->m_unshuffledInputStrides, this->m_impl.data()); + } }; -- cgit v1.2.3
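Note on the shuffling block() slow path: after reading an unshuffled block from the input, the coefficients are permuted in place by following permutation cycles, with a bitmap marking coefficients that are already in their final position. The sketch below shows the same cycle-following idea on a plain std::vector; permute_in_place and perm are illustrative names, with perm playing the role of GetBlockOutputIndex (input position -> output position).

  // Standalone sketch of in-place permutation by cycle following
  // (assumption: perm[i] is the destination of the element at position i).
  #include <utility>
  #include <vector>

  template <typename T>
  void permute_in_place(std::vector<T>& data, const std::vector<size_t>& perm) {
    std::vector<bool> done(data.size(), false);
    for (size_t start = 0; start < data.size(); ++start) {
      if (done[start] || perm[start] == start) {
        done[start] = true;  // already placed, or a fixed point of the permutation
        continue;
      }
      // Walk the cycle beginning at 'start', evicting the value currently
      // stored at each destination and carrying it forward.
      size_t dst = perm[start];
      T carried = data[start];
      while (dst != start) {
        std::swap(carried, data[dst]);
        done[dst] = true;
        dst = perm[dst];
      }
      data[start] = carried;
      done[start] = true;
    }
  }

Every coefficient is moved O(1) times, so the in-place shuffle stays cheap even though reads within a cycle are scattered; because the block size is bounded by the L1-based m_block_total_size_max, those scattered accesses remain cache resident.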