diff options
author | Eugene Zhulenev <ezhulenev@google.com> | 2019-10-02 12:44:06 -0700 |
---|---|---|
committer | Eugene Zhulenev <ezhulenev@google.com> | 2019-10-02 12:44:06 -0700 |
commit | 60ae24ee1a6c16114de456d77fcfba6f5a1160ca (patch) | |
tree | 7b9d5463018055571a5050ca31a8d3df12a3e6fc /unsupported/Eigen/CXX11/src | |
parent | 6e40454a6e6cc57c07c7340148657c985ca6c928 (diff) |
Add block evaluation to TensorReshaping/TensorCasting/TensorPadding/TensorSelect
Diffstat (limited to 'unsupported/Eigen/CXX11/src')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h | 275 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 8 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 55 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 138 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h | 11 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 95 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 254 |
7 files changed, 692 insertions, 144 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h index 25047b8e5..4d2145bf3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h @@ -12,13 +12,18 @@ namespace Eigen { namespace internal { // -------------------------------------------------------------------------- // +// Forward declarations for templates defined below. +template <typename Scalar, typename IndexType, int NumDims, int Layout> +class TensorBlockIOV2; + +// -------------------------------------------------------------------------- // // Helper function to compute strides for densely stored buffer of given // dimensions. // TODO(ezhulenev): We compute strides 1000 times in different evaluators, use // this function instead everywhere. template <int Layout, typename IndexType, int NumDims> -EIGEN_STRONG_INLINE DSizes<IndexType, NumDims> strides( +EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides( const DSizes<IndexType, NumDims>& dimensions) { DSizes<IndexType, NumDims> strides; if (NumDims == 0) return strides; @@ -40,6 +45,14 @@ EIGEN_STRONG_INLINE DSizes<IndexType, NumDims> strides( return strides; } +#if EIGEN_HAS_CXX11 +template <int Layout, std::ptrdiff_t... Indices> +EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides( + const Sizes<Indices...>& sizes) { + return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes)); +} +#endif + // -------------------------------------------------------------------------- // // TensorBlockDescriptor specifies a block offset within a tensor and the block // sizes along each of the tensor dimensions. 
@@ -155,6 +168,14 @@ class TensorBlockDescriptor { DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes); } + template <typename Scalar, typename DstStridesIndexType> + void AddDestinationBuffer( + Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides, + size_t total_dst_bytes) { + // DSizes constructor will do index type promotion if it's safe. + AddDestinationBuffer(dst_base, Dimensions(dst_strides), total_dst_bytes); + } + TensorBlockDescriptor& DropDestinationBuffer() { m_destination.m_data = NULL; return *this; @@ -333,10 +354,11 @@ class TensorMaterializedBlock { typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind; #endif public: + typedef DSizes<IndexType, NumDims> Dimensions; typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType; TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, - const DSizes<IndexType, NumDims>& dimensions) + const Dimensions& dimensions) : m_kind(kind), m_data(data), m_dimensions(dimensions), @@ -352,18 +374,84 @@ class TensorMaterializedBlock { // properly for TensorMap. const XprType& expr() const { return m_expr; } const Scalar* data() const { return m_data; } - void cleanup() {} + typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc; + + // Creates a materialized block for the given descriptor from a memory buffer. + template <typename DataDimensions, typename TensorBlockScratch> + EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize( + const Scalar* data, const DataDimensions& data_dims, + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size()); + + // If a tensor block dimensions covers a contiguous block of the underlying + // memory, we can skip block buffer memory allocation, and construct a block + // from existing `data` memory buffer. 
+ // + // Example: (RowMajor layout) + // data_dims: [11, 12, 13, 14] + // desc.dimensions(): [1, 1, 3, 14] + // + // In this case we can construct a TensorBlock starting at + // `data + desc.offset()`, with a `desc.dimensions()` block sizes. + static const bool is_col_major = Layout == ColMajor; + + // Find out how many inner dimensions have a matching size. + int num_matching_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + int dim = is_col_major ? i : NumDims - i - 1; + if (data_dims[dim] != desc.dimensions()[dim]) break; + ++num_matching_inner_dims; + } + + // All the outer dimensions must be of size `1`, except a single dimension + // before the matching inner dimension (`3` in the example above). + bool can_use_direct_access = true; + for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { + int dim = is_col_major ? i : NumDims - i - 1; + if (desc.dimension(dim) != 1) { + can_use_direct_access = false; + break; + } + } + + if (can_use_direct_access) { + const Scalar* block_start = data + desc.offset(); + return TensorMaterializedBlock(TensorBlockKind::kView, block_start, + desc.dimensions()); + + } else { + void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); + Scalar* block_buffer = static_cast<Scalar*>(mem); + + typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout> + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)), + data, desc.offset()); + TensorBlockIODst dst(desc.dimensions(), + internal::strides<Layout>(desc.dimensions()), + block_buffer); + + TensorBlockIO::Copy(dst, src); + + return TensorMaterializedBlock(TensorBlockKind::kMaterializedInScratch, + block_buffer, desc.dimensions()); + } + } + private: TensorBlockKind m_kind; const Scalar* m_data; - DSizes<IndexType, NumDims> m_dimensions; + Dimensions m_dimensions; XprType m_expr; }; // 
-------------------------------------------------------------------------- // -// TensorCwiseUnaryBlock is a lazy tensor expression that applies UnaryOp +// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp // functor to the blocks produced by the underlying Tensor expression. template <typename UnaryOp, typename ArgTensorBlock> @@ -398,7 +486,7 @@ class TensorCwiseUnaryBlock { }; // -------------------------------------------------------------------------- // -// TensorCwiseUnaryBlock is a lazy tensor expression that applies BinaryOp +// TensorCwiseBinaryBlock is a lazy tensor expression block that applies BinaryOp // functor to the blocks produced by the underlying Tensor expression. template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock> @@ -447,6 +535,96 @@ class TensorCwiseBinaryBlock { }; // -------------------------------------------------------------------------- // +// TensorUnaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from a block of the underlying type (this is a +// generalization of the TensorCwiseUnaryBlock for arbitrary expressions). 
+ +template <typename BlockFactory, typename ArgTensorBlock> +class TensorUnaryExprBlock { +#if !EIGEN_HAS_CXX11 + typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind; +#endif + + typedef typename ArgTensorBlock::XprType ArgXprType; + static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType<ArgXprType>::type>::type XprType; + + typedef typename XprScalar<XprType>::type Scalar; + + TensorUnaryExprBlock(const ArgTensorBlock& arg_block, + const BlockFactory& factory) + : m_arg_block(arg_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// TensorTernaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from three blocks of the underlying type. 
+ +template <typename BlockFactory, typename Arg1TensorBlock, + typename Arg2TensorBlock, typename Arg3TensorBlock> +class TensorTernaryExprBlock { +#if !EIGEN_HAS_CXX11 + typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind; +#endif + + typedef typename Arg1TensorBlock::XprType Arg1XprType; + typedef typename Arg2TensorBlock::XprType Arg2XprType; + typedef typename Arg3TensorBlock::XprType Arg3XprType; + + static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value || + internal::is_void<Arg2XprType>::value || + internal::is_void<Arg3XprType>::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType<Arg1XprType, Arg2XprType, + Arg3XprType>::type>::type XprType; + + typedef typename XprScalar<XprType>::type Scalar; + + TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, + const Arg2TensorBlock& arg2_block, + const Arg3TensorBlock& arg3_block, + const BlockFactory& factory) + : m_arg1_block(arg1_block), + m_arg2_block(arg2_block), + m_arg3_block(arg3_block), + m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { + return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), + m_arg3_block.expr()); + } + const Scalar* data() const { return NULL; } + void cleanup() { + m_arg1_block.cleanup(); + m_arg2_block.cleanup(); + m_arg3_block.cleanup(); + } + + private: + Arg1TensorBlock m_arg1_block; + Arg2TensorBlock m_arg2_block; + Arg3TensorBlock m_arg3_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // // StridedLinearBufferCopy provides a method to copy data between two linear // buffers with different strides, with optimized paths for scatter/gather. @@ -547,7 +725,13 @@ class StridedLinearBufferCopy { } else if (kind == FillLinear) { // Fill `dst` with value at `*src`. 
eigen_assert(src_stride == 0 && dst_stride == 1); + const IndexType unrolled_size = count - 4 * PacketSize; Packet p = pload1<Packet>(src); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p); + } + } for (; i <= vectorized_size; i += PacketSize) { pstoreu<Scalar, Packet>(dst + i, p); } @@ -809,15 +993,15 @@ class TensorBlockIOV2 { // -------------------------------------------------------------------------- // // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to -// a Tensor block defined by `desc`, backed by a memory buffer at `dst` address. +// a Tensor block defined by `desc`, backed by a memory buffer at `target`. // // Currently there is no way to write from a Tensor expression to a block of // memory, if dimensions are reordered. If you need to do that, you should // materialize a Tensor block expression into a memory buffer, and then use // TensorBlockIO to copy data between two memory buffers with a custom -// `dst->src` dimension map (see definition above). +// `target->src` dimension map (see definition above). // -// Also currently the innermost dimension of `dst` must have a stride '1' +// Also currently the innermost dimension of `target` must have a stride '1' // (contiguous in memory). This restriction could be lifted with a `pscatter`, // but in practice it's never needed, and there is a similar TensorBlockIO // workaround for that. 
@@ -842,18 +1026,18 @@ class TensorBlockAssignment { template <bool Vectorizable, typename Evaluator> struct InnerDimAssign { - EIGEN_ALWAYS_INLINE static void Run(Scalar* dst, IndexType count, + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) { for (IndexType i = 0; i < count; ++i) { - dst[i] = eval.coeff(eval_offset + i); + target[i] = eval.coeff(eval_offset + i); } } }; template <typename Evaluator> struct InnerDimAssign<true, Evaluator> { - EIGEN_ALWAYS_INLINE static void Run(Scalar* dst, IndexType count, + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) { typedef typename packet_traits<Scalar>::type Packet; @@ -866,26 +1050,29 @@ class TensorBlockAssignment { for (int j = 0; j < 4; ++j) { const IndexType idx = eval_offset + i + j * PacketSize; Packet p = eval.template packet<Unaligned>(idx); - pstoreu<Scalar>(dst + i + j * PacketSize, p); + pstoreu<Scalar>(target + i + j * PacketSize, p); } } for (; i <= vectorized_size; i += PacketSize) { Packet p = eval.template packet<Unaligned>(eval_offset + i); - pstoreu<Scalar>(dst + i, p); + pstoreu<Scalar>(target + i, p); } for (; i < count; ++i) { - dst[i] = eval.coeff(eval_offset + i); + target[i] = eval.coeff(eval_offset + i); } } }; public: - struct Dst { - Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, - IndexType dst_offset = 0) - : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} + struct Target { + Target(const Dimensions& target_dims, const Dimensions& target_strides, + Scalar* target_data, IndexType target_offset = 0) + : dims(target_dims), + strides(target_strides), + data(target_data), + offset(target_offset) {} Dimensions dims; Dimensions strides; @@ -893,34 +1080,50 @@ class TensorBlockAssignment { IndexType offset; }; + static Target target(const Dimensions& target_dims, + const Dimensions& target_strides, Scalar* 
target_data, + IndexType target_offset = 0) { + return Target(target_dims, target_strides, target_data, target_offset); + } + + template <typename TargetDimsIndexType, typename TargetStridesIndexType> + static Target target( + const DSizes<TargetDimsIndexType, NumDims>& target_dims, + const DSizes<TargetStridesIndexType, NumDims>& target_strides, + Scalar* target_data, IndexType target_offset = 0) { + // DSizes constructor will do index type promotion if it's safe. + return Target(Dimensions(target_dims), Dimensions(target_strides), + target_data, target_offset); + } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Dst& dst, const TensorBlockExpr& expr) { + const Target& target, const TensorBlockExpr& expr) { // Prepare evaluator for block expression. DefaultDevice default_device; TensorBlockEvaluator eval(expr, default_device); // Tensor block expression dimension should match destination dimensions. - eigen_assert(dimensions_match(dst.dims, eval.dimensions())); + eigen_assert(dimensions_match(target.dims, eval.dimensions())); static const int Layout = TensorBlockEvaluator::Layout; static const bool is_col_major = Layout == ColMajor; // Initialize output inner dimension size based on a layout. - const IndexType output_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); + const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; - IndexType output_inner_dim_size = dst.dims[inner_dim_idx]; + IndexType output_inner_dim_size = target.dims[inner_dim_idx]; - // Dst inner dimension stride must be '1'. - eigen_assert(dst.strides[inner_dim_idx] == 1); + // Target inner dimension stride must be '1'. + eigen_assert(target.strides[inner_dim_idx] == 1); - // Squeeze multiple inner dims into one if they are contiguous in `dst`. + // Squeeze multiple inner dims into one if they are contiguous in `target`. 
IndexType num_squeezed_dims = 0; for (Index i = 1; i < NumDims; ++i) { const Index dim = is_col_major ? i : NumDims - i - 1; - const IndexType dst_stride = dst.strides[dim]; + const IndexType target_stride = target.strides[dim]; - if (output_inner_dim_size == dst_stride) { - output_inner_dim_size *= dst.dims[dim]; + if (output_inner_dim_size == target_stride) { + output_inner_dim_size *= target.dims[dim]; num_squeezed_dims++; } else { break; @@ -936,22 +1139,22 @@ class TensorBlockAssignment { const Index dim = is_col_major ? i + 1 : NumDims - i - 2; it[idx].count = 0; - it[idx].size = dst.dims[dim]; - it[idx].output_stride = dst.strides[dim]; + it[idx].size = target.dims[dim]; + it[idx].output_stride = target.strides[dim]; it[idx].output_span = it[i].output_stride * (it[i].size - 1); idx++; } // We read block expression from the beginning, and start writing data to - // `dst` at given offset. + // `target` at given offset. IndexType input_offset = 0; - IndexType output_offset = dst.offset; + IndexType output_offset = target.offset; - // Iterate copying data from `eval` to `dst`. + // Iterate copying data from `eval` to `target`. for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { - // Assign to `dst` at current offset. + // Assign to `target` at current offset. 
InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess, - TensorBlockEvaluator>::Run(dst.data + output_offset, + TensorBlockEvaluator>::Run(target.data + output_offset, output_inner_dim_size, eval, input_offset); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 9e4fae99a..dc9551d32 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -1247,10 +1247,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index> TensorBlockAssignment; - typename TensorBlockAssignment::Dst assignment_dst( - input_block_sizes, input_block_strides, *materialized_input); - - TensorBlockAssignment::Run(assignment_dst, input_block.expr()); + TensorBlockAssignment::Run( + TensorBlockAssignment::target(input_block_sizes, input_block_strides, + *materialized_input), + input_block.expr()); input_buffer = *materialized_input; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a8160e17e..cc3e67677 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -294,23 +294,45 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = false, - PacketAccess = + IsAligned = false, + PacketAccess = #ifndef EIGEN_USE_SYCL - true, + true, #else - TensorEvaluator<ArgType, Device>::PacketAccess & - internal::type_casting_traits<SrcType, TargetType>::VectorizedCast, + TensorEvaluator<ArgType, Device>::PacketAccess & + internal::type_casting_traits<SrcType, TargetType>::VectorizedCast, #endif - BlockAccess = false, - BlockAccessV2 = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, 
Device>::Layout, - RawAccess = false + BlockAccess = false, + BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2, + PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + RawAccess = false }; + static const int NumDims = internal::array_size<Dimensions>::value; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2 + ArgTensorBlock; + + struct TensorConversionOpBlockFactory { + template <typename ArgXprType> + struct XprType { + typedef TensorConversionOp<TargetType, const ArgXprType> type; + }; + + template <typename ArgXprType> + typename XprType<ArgXprType>::type expr(const ArgXprType& expr) const { + return typename XprType<ArgXprType>::type(expr); + } + }; + + typedef internal::TensorUnaryExprBlock<TensorConversionOpBlockFactory, + ArgTensorBlock> + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -376,6 +398,17 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + return TensorBlockV2(m_impl.blockV2(desc, scratch), + TensorConversionOpBlockFactory()); + } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } /// required by sycl in order to extract the sycl accessor 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index c87075a72..b1d668744 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -176,11 +176,12 @@ struct TensorEvaluator typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr, Index> TensorBlockAssign; - typename TensorBlockAssign::Dst dst(desc.dimensions(), - internal::strides<Layout>(m_dims), - m_data, desc.offset()); - TensorBlockAssign::Run(dst, block.expr()); + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), + internal::strides<Layout>(m_dims), m_data, + desc.offset()), + block.expr()); } EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } @@ -349,62 +350,7 @@ struct TensorEvaluator<const Derived, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { assert(m_data != NULL); - - // TODO(ezhulenev): Move it to TensorBlockV2 and reuse in TensorForcedEval. - - // If a tensor block descriptor covers a contiguous block of the underlying - // memory, we can skip block buffer memory allocation, and construct a block - // from existing `m_data` memory buffer. - // - // Example: (RowMajor layout) - // m_dims: [11, 12, 13, 14] - // desc.dimensions(): [1, 1, 3, 14] - // - // In this case we can construct a TensorBlock starting at - // `m_data + desc.offset()`, with a `desc.dimensions()` block sizes. - - static const bool - is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor); - - // Find out how many inner dimensions have a matching size. - int num_matching_inner_dims = 0; - for (int i = 0; i < NumCoords; ++i) { - int dim = is_col_major ? 
i : NumCoords - i - 1; - if (m_dims[dim] != desc.dimensions()[dim]) break; - ++num_matching_inner_dims; - } - - // All the outer dimensions must be of size `1`, except a single dimension - // before the matching inner dimension (`3` in the example above). - bool can_use_direct_access = true; - for (int i = num_matching_inner_dims + 1; i < NumCoords; ++i) { - int dim = is_col_major ? i : NumCoords - i - 1; - if (desc.dimension(dim) != 1) { - can_use_direct_access = false; - break; - } - } - - if (can_use_direct_access) { - EvaluatorPointerType block_start = m_data + desc.offset(); - return TensorBlockV2(internal::TensorBlockKind::kView, block_start, - desc.dimensions()); - - } else { - void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); - ScalarNoConst* block_buffer = static_cast<ScalarNoConst*>(mem); - - TensorBlockIOSrc src(internal::strides<Layout>(m_dims), m_data, - desc.offset()); - TensorBlockIODst dst(desc.dimensions(), - internal::strides<Layout>(desc.dimensions()), - block_buffer); - - TensorBlockIO::Copy(dst, src); - - return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch, - block_buffer, desc.dimensions()); - } + return TensorBlockV2::materialize(m_data, m_dims, desc, scratch); } EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } @@ -923,15 +869,21 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> typedef typename XprType::Scalar Scalar; enum { - IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess & - PacketType<Scalar, Device>::HasBlend, - BlockAccess = false, - BlockAccessV2 = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<IfArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & + 
TensorEvaluator<ElseArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & + TensorEvaluator<ElseArgType, Device>::PacketAccess & + PacketType<Scalar, Device>::HasBlend, + BlockAccess = false, + BlockAccessV2 = TensorEvaluator<IfArgType, Device>::BlockAccessV2 && + TensorEvaluator<ThenArgType, Device>::BlockAccessV2 && + TensorEvaluator<ElseArgType, Device>::BlockAccessV2, + PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess || + TensorEvaluator<ThenArgType, Device>::PreferBlockAccess || + TensorEvaluator<ElseArgType, Device>::PreferBlockAccess, + Layout = TensorEvaluator<IfArgType, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -953,8 +905,36 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> typedef StorageMemory<CoeffReturnType, Device> Storage; typedef typename Storage::Type EvaluatorPointerType; + static const int NumDims = internal::array_size<Dimensions>::value; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlockV2 + IfArgTensorBlock; + typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlockV2 + ThenArgTensorBlock; + typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlockV2 + ElseArgTensorBlock; + + struct TensorSelectOpBlockFactory { + template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType> + struct XprType { + typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type; + }; + + template <typename IfArgXprType, typename ThenArgXprType, typename 
ElseArgXprType> + typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr( + const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const { + return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr); + } + }; + + typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory, + IfArgTensorBlock, ThenArgTensorBlock, + ElseArgTensorBlock> + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -1000,6 +980,24 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + m_condImpl.getResourceRequirements(resources); + m_thenImpl.getResourceRequirements(resources); + m_elseImpl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + // It's unsafe to pass destination buffer to underlying expressions, because + // output might be aliased with one of the inputs. 
+ desc.DropDestinationBuffer(); + + return TensorBlockV2( + m_condImpl.blockV2(desc, scratch), m_thenImpl.blockV2(desc, scratch), + m_elseImpl.blockV2(desc, scratch), TensorSelectOpBlockFactory()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index be8f3a734..2a3398d67 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -324,6 +324,17 @@ struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> { } }; +template <typename FirstType, typename... OtherTypes> +std::ostream& operator<<(std::ostream& os, + const IndexList<FirstType, OtherTypes...>& dims) { + os << "["; + for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} template<typename FirstType, typename... OtherTypes> constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index c8333e488..5d4b0f061 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -113,6 +113,25 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> static const int NumOutputDims = internal::array_size<Dimensions>::value; static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + enum ReshapingKind { + // We do not use layout information to determine reshaping kind. + // Depending on the layout `N` can be inner or outer dimension. + OneByN = 0, // expr.reshape(1, N) + NByOne = 1, // expr.reshape(N, 1) + Runtime = 2 // Reshape dimensions are dynamic (specified at runtime). 
+ }; + + // clang-format off + static const ReshapingKind kind = +#if defined(EIGEN_HAS_INDEX_LIST) + (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/0, /*value=*/1)) ? OneByN + : (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/1, /*value=*/1)) ? NByOne + : Runtime; +#else + Runtime; +#endif + // clang-format on + enum { IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, @@ -121,8 +140,12 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess && TensorEvaluator<ArgType, Device>::RawAccess && NumInputDims > 0 && NumOutputDims > 0, - BlockAccessV2 = false, - PreferBlockAccess = true, + // For trivial reshapes with raw access to underlying data we will provide + // zero overhead block access. + // TODO(ezhulenev): Consider adding block access without raw access? + BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + PreferBlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator<ArgType, Device>::RawAccess @@ -139,7 +162,13 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> OutputTensorBlockReader; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef + typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims, + Layout, Index> + TensorBlockV2; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -199,8 
+228,9 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( - std::vector<internal::TensorOpResourceRequirements>* resources) const { - m_impl.getResourceRequirements(resources); + std::vector<internal::TensorOpResourceRequirements>*) const { + // TODO(ezhulenev): If we'll ever support block evaluation without raw + // access we'll need to get requirements from `m_impl`. } // required in block(OutputTensorBlock* output_block) const @@ -334,6 +364,26 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + eigen_assert(m_impl.data() != NULL); + eigen_assert((kind == Runtime) || + (kind == OneByN && desc.dimensions()[0] == 1) || + (kind == NByOne && desc.dimensions()[1] == 1)); + + if (kind == OneByN || kind == NByOne) { + // We can guarantee at compile time that block is just a contiguous slice + // of the underlying expression memory buffer. + return TensorBlockV2(internal::TensorBlockKind::kView, + m_impl.data() + desc.offset(), desc.dimensions()); + } else { + // This will do additional runtime checks, and in the end it might be also + // a view, or it might be a block materialized in the temporary buffer. 
+ return TensorBlockV2::materialize(m_impl.data(), m_dimensions, desc, + scratch); + } + } + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return constCast(m_impl.data()); } @@ -365,14 +415,14 @@ template<typename NewDimensions, typename ArgType, typename Device> typedef NewDimensions Dimensions; enum { - IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = false, - BlockAccessV2 = false, + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess, PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator<ArgType, Device>::RawAccess + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator<ArgType, Device>::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -385,18 +435,37 @@ template<typename NewDimensions, typename ArgType, typename Device> typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor<TensorEvaluator::NumOutputDims, Index> + TensorBlockDesc; //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { return this->m_impl.coeffRef(index); } + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { this->m_impl.template writePacket<StoreMode>(index, x); } + + template <typename TensorBlock> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
writeBlockV2( + const TensorBlockDesc& desc, const TensorBlock& block) { + assert(this->m_impl.data() != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment< + Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index> + TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), + internal::strides<Layout>(this->dimensions()), + this->m_impl.data(), desc.offset()), + block.expr()); + } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 7b9ad7374..be2449ebd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -96,22 +96,29 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, - BlockAccess = false, - BlockAccessV2 = false, - PreferBlockAccess = false, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = true, - RawAccess = false + IsAligned = true, + PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, + BlockAccess = false, + BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = true, + RawAccess = false }; + typedef typename internal::remove_const<Scalar>::type ScalarNoConst; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlockV2; + typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, + Layout, Index> + TensorBlockV2; 
//===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) { // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector @@ -212,6 +219,214 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device return cost; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( + std::vector<internal::TensorOpResourceRequirements>* resources) const { + Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>( + 1, m_device.lastLevelCacheSize() / sizeof(Scalar)); + resources->push_back(internal::TensorOpResourceRequirements( + internal::kSkewedInnerDims, block_total_size_max)); + + m_impl.getResourceRequirements(resources); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + eigen_assert(m_impl.data() != NULL); + + // Check if we can reuse `desc` destination, or allocate new scratch buffer. 
+ ScalarNoConst* materialized_output = + desc.template destination<ScalarNoConst, Layout>(); + + bool materialized_in_output; + if (materialized_output != NULL) { + desc.DropDestinationBuffer(); + materialized_in_output = true; + + } else { + const size_t materialized_output_size = desc.size() * sizeof(Scalar); + void* output_scratch_mem = scratch.allocate(materialized_output_size); + materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem); + materialized_in_output = false; + } + + static const bool IsColMajor = Layout == static_cast<int>(ColMajor); + + Index offset = desc.offset(); + + // Compute offsets in the output tensor corresponding to the desc.offset(). + DSizes<Index, NumDims> output_offsets; + for (int i = NumDims - 1; i > 0; --i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + const int stride_dim = IsColMajor ? dim : dim + 1; + output_offsets[dim] = offset / m_outputStrides[stride_dim]; + offset -= output_offsets[dim] * m_outputStrides[stride_dim]; + } + output_offsets[IsColMajor ? 0 : NumDims - 1] = offset; + + // Offsets in the input corresponding to output offsets. + DSizes<Index, NumDims> input_offsets = output_offsets; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + input_offsets[dim] = input_offsets[dim] - m_padding[dim].first; + } + + // Compute offset in the input buffer (at this point it might be illegal and + // point outside of the input buffer, because we don't check for negative + // offsets, it will be autocorrected in the block iteration loop below). + Index input_offset = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + input_offset += input_offsets[dim] * m_inputStrides[dim]; + } + + // Destination buffer and scratch buffer both indexed from 0 and have the + // same dimensions as the requested block (for destination buffer this + // property is guaranteed by `desc.destination()`). 
+ Index output_offset = 0; + const DSizes<Index, NumDims> output_strides = + internal::strides<Layout>(desc.dimensions()); + + // NOTE(ezhulenev): We initialize block iteration state for `NumDims - 1` + // dimensions, skipping innermost dimension. In theory it should be possible + // to squeeze matching innermost dimensions, however in practice that did + // not show any improvements in benchmarks. Also in practice first outer + // dimension usually has padding, and will prevent squeezing. + + // Initialize output block iterator state. Dimensions in this array are + // always in inner_most -> outer_most order (col major layout). + array<BlockIteratorState, NumDims - 1> it; + for (Index i = 0; i < NumDims - 1; ++i) { + const Index dim = IsColMajor ? i + 1 : NumDims - i - 2; + it[i].count = 0; + it[i].size = desc.dimension(dim); + + it[i].input_stride = m_inputStrides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + it[i].output_stride = output_strides[dim]; + it[i].output_span = it[i].output_stride * (it[i].size - 1); + } + + const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1; + + // Total output size. + const Index output_size = desc.size(); + + // We will fill inner dimension of this size in the output. It might be + // larger than the inner dimension in the input, so we might have to pad + // before/after we copy values from the input inner dimension. + const Index output_inner_dim_size = desc.dimension(inner_dim_idx); + + // How many values to fill with padding BEFORE reading from the input inner + // dimension. + const Index output_inner_pad_before_size = + input_offsets[inner_dim_idx] < 0 + ? numext::mini(numext::abs(input_offsets[inner_dim_idx]), + output_inner_dim_size) + : 0; + + // How many values we can actually copy from the input inner dimension. + const Index output_inner_copy_size = numext::mini( + // Want to copy from input. + (output_inner_dim_size - output_inner_pad_before_size), + // Can copy from input. 
+ (static_cast<Index>(m_impl.dimensions()[inner_dim_idx]) - + numext::maxi(input_offsets[inner_dim_idx], Index(0)))); + + // How many values to fill with padding AFTER reading from the input inner + // dimension. + const Index output_inner_pad_after_size = + (output_inner_dim_size - output_inner_copy_size - + output_inner_pad_before_size); + + // Sanity check, sum of all sizes must be equal to the output size. + eigen_assert(output_inner_dim_size == + (output_inner_pad_before_size + output_inner_copy_size + + output_inner_pad_after_size)); + + // Keep track of current coordinates and padding in the output. + DSizes<Index, NumDims> output_coord = output_offsets; + DSizes<Index, NumDims> output_padded; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + + typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy; + + // Iterate copying data from `m_impl.data()` to the output buffer. + for (Index size = 0; size < output_size; size += output_inner_dim_size) { + // Detect if we are in the padded region (exclude innermost dimension). + bool is_padded = false; + for (int j = 1; j < NumDims; ++j) { + const int dim = IsColMajor ? j : NumDims - j - 1; + is_padded = output_padded[dim]; + if (is_padded) break; + } + + if (is_padded) { + // Fill with padding value. + LinCopy::template Run<LinCopy::Kind::FillLinear>( + typename LinCopy::Dst(output_offset, 1, materialized_output), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_dim_size); + + } else { + { // Fill with padding before copying from input inner dimension. + const Index out = output_offset; + + LinCopy::template Run<LinCopy::Kind::FillLinear>( + typename LinCopy::Dst(out, 1, materialized_output), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_before_size); + } + + { // Copy data from input inner dimension. 
+ const Index out = output_offset + output_inner_pad_before_size; + const Index in = input_offset + output_inner_pad_before_size; + + LinCopy::template Run<LinCopy::Kind::Linear>( + typename LinCopy::Dst(out, 1, materialized_output), + typename LinCopy::Src(in, 1, m_impl.data()), + output_inner_copy_size); + } + + { // Fill with padding after copying from input inner dimension. + const Index out = output_offset + output_inner_pad_before_size + + output_inner_copy_size; + + LinCopy::template Run<LinCopy::Kind::FillLinear>( + typename LinCopy::Dst(out, 1, materialized_output), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_after_size); + } + } + + for (int j = 0; j < NumDims - 1; ++j) { + const int dim = IsColMajor ? j + 1 : NumDims - j - 2; + + if (++it[j].count < it[j].size) { + input_offset += it[j].input_stride; + output_offset += it[j].output_stride; + output_coord[dim] += 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + break; + } + it[j].count = 0; + input_offset -= it[j].input_span; + output_offset -= it[j].output_span; + output_coord[dim] -= it[j].size - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + } + + return TensorBlockV2(materialized_in_output + ? 
internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + materialized_output, + desc.dimensions()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL @@ -222,6 +437,23 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device #endif private: + struct BlockIteratorState { + BlockIteratorState() + : count(0), + size(0), + input_stride(0), + input_span(0), + output_stride(0), + output_span(0) {} + + Index count; + Index size; + Index input_stride; + Index input_span; + Index output_stride; + Index output_span; + }; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( Index index, int dim_index) const { #if defined(EIGEN_HAS_INDEX_LIST) @@ -410,6 +642,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device PaddingDimensions m_padding; Scalar m_paddingValue; + + const Device EIGEN_DEVICE_REF m_device; }; |