From ef9dfee7bdc8e0d82c9b7ddf9414ef99d866d7ba Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 24 Sep 2019 12:52:45 -0700 Subject: Tensor block evaluation V2 support for unary/binary/broadcsting --- unsupported/Eigen/CXX11/Tensor | 1 + unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 10 + .../Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h | 5 + unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h | 37 +++ unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 5 + .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 367 +++++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorChipping.h | 5 + .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 5 + .../Eigen/CXX11/src/Tensor/TensorConversion.h | 5 + .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 10 + .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 5 + .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 10 + .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 13 +- unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 5 + .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 190 +++++++++-- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 172 ++++++++-- unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 5 + .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 5 + .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 19 +- .../CXX11/src/Tensor/TensorForwardDeclarations.h | 46 ++- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 5 + .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 5 + .../Eigen/CXX11/src/Tensor/TensorInflation.h | 5 + .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 10 + .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 30 ++ unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 5 + unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 5 + .../Eigen/CXX11/src/Tensor/TensorReduction.h | 5 + unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 17 +- unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 10 + unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 5 + .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 10 + .../Eigen/CXX11/src/Tensor/TensorStriding.h | 5 + unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 5 + .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 5 + 36 files changed, 1003 insertions(+), 56 deletions(-) (limited to 'unsupported/Eigen') diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 5d18aeb3f..04b20f464 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -114,6 +114,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorBase.h" #include "src/Tensor/TensorBlock.h" +#include "src/Tensor/TensorBlockV2.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 05e7963f0..5cb5b7a2e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -89,12 +89,17 @@ struct TensorEvaluator, Device> IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const 
XprType& op, const Device& device) : m_impl(op.expression(), device) { } @@ -226,12 +231,17 @@ struct TensorEvaluator, Devi IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator >, Device>::Layout, CoordAccess = false, // to be implemented RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h index 5110e99ee..e6d8e7f91 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h @@ -109,12 +109,17 @@ struct TensorEvaluator, Sy IsAligned = false, PacketAccess = false, BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device) : m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()), m_stride_div(op.stride_div()){} diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 270ad974e..29aa7a97e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -110,6 +110,8 @@ struct TensorEvaluator, Device> TensorEvaluator::PacketAccess, BlockAccess = TensorEvaluator::BlockAccess & TensorEvaluator::BlockAccess, + BlockAccessV2 = TensorEvaluator::BlockAccessV2 & + TensorEvaluator::BlockAccessV2, PreferBlockAccess = TensorEvaluator::PreferBlockAccess | TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -120,6 +122,18 @@ struct TensorEvaluator, Device> typename internal::remove_const::type, Index, NumDims, Layout> TensorBlock; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlockV2 + RightTensorBlock; + + typedef internal::TensorBlockAssignment< + Scalar, NumDims, typename RightTensorBlock::XprType, Index> + TensorBlockAssignment; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) @@ -214,6 +228,29 @@ struct TensorEvaluator, Device> m_leftImpl.writeBlock(*block); } } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2( + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + if (TensorEvaluator::RawAccess && + m_leftImpl.data() != NULL) { + // If 
destination has raw data access, we pass it as a potential + // destination for a block descriptor evaluation. + desc.AddDestinationBuffer( + /*dst_base=*/m_leftImpl.data() + desc.offset(), + /*dst_strides=*/internal::strides(m_leftImpl.dimensions()), + /*total_dst_bytes=*/ + (internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar))); + } + + RightTensorBlock block = m_rightImpl.blockV2(desc, scratch); + // If block was evaluated into a destination, there is no need to do + // assignment. + if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + m_leftImpl.writeBlockV2(desc, block); + } + block.cleanup(); + } + #ifdef EIGEN_USE_SYCL // binding placeholder accessors to a command group handler for SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index c8a8b16db..a8e7a8d7b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -1025,6 +1025,11 @@ class TensorBlockMapper { return m_block_dim_sizes.TotalSize(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& + block_dim_sizes() const { + return m_block_dim_sizes; + } + private: static Dimensions BlockDimensions(const Dimensions& tensor_dims, const TensorBlockShapeType block_shape, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index b290de311..9e4fae99a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -115,6 +115,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = TensorEvaluator::BlockAccess, + BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = true, Layout = TensorEvaluator::Layout, RawAccess = false @@ -131,11 +132,24 @@ struct TensorEvaluator, Device> // We do block based broadcasting using a trick with 2x tensor rank and 0 // strides. See block method implementation for details. typedef DSizes BroadcastDimensions; + typedef internal::TensorBlock BroadcastTensorBlock; typedef internal::TensorBlockReader BroadcastTensorBlockReader; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlockV2 + ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock + TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : isCopy(false), nByOne(false), oneByN(false), @@ -867,6 +881,292 @@ struct TensorEvaluator, Device> } } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + static const bool + is_col_major = static_cast(Layout) == static_cast(ColMajor); + + // Return a block with a single scalar. + if (NumDims <= 0) return scalarBlock(scratch); + + // Because we only support kSkewedInnerDims blocking, block size should be + // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size + // for the first outer dim, and 1 for other outer dims. This is guaranteed + // by MergeResourceRequirements() in TensorBlock.h. 
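A standalone numeric sketch of the kSkewedInnerDims block shape that the comment above relies on; the dimensions, target size, and loop are illustrative assumptions only, not the BlockDimensions() implementation in TensorBlock.h.

// Assumed illustration: column-major tensor dims [8, 10, 6], target block
// size 200 coefficients. Inner dimensions keep their full size, the first
// dimension that no longer fits is truncated, remaining outer dims get 1.
#include <cstdio>

int main() {
  const long tensor_dims[3] = {8, 10, 6};
  const long target_block_size = 200;

  long block_dims[3] = {1, 1, 1};
  long coeffs = 1;
  for (int i = 0; i < 3; ++i) {  // i == 0 is the innermost (column-major) dim
    const long fit = target_block_size / coeffs;
    if (fit == 0) break;                        // outer dims stay at size 1
    block_dims[i] = fit < tensor_dims[i] ? fit : tensor_dims[i];
    coeffs *= block_dims[i];
  }
  std::printf("[%ld, %ld, %ld]\n", block_dims[0], block_dims[1], block_dims[2]);
  // Prints [8, 10, 2]: full inner dims, a truncated first outer dim.
  return 0;
}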
+ const Dimensions& output_dims = desc.dimensions(); + const Dimensions output_strides = internal::strides(output_dims); + + // Find where outer dims start. + int outer_dim_start = 0; + Index outer_dim_size = 1; + Index inner_dim_size = 1; + + for (int i = 0; i < NumDims; ++i) { + const int dim = is_col_major ? i : NumDims - i - 1; + + if (i > outer_dim_start) { + eigen_assert(output_dims[dim] == 1); + } else if (output_dims[dim] != m_dimensions[dim]) { + eigen_assert(output_dims[dim] < m_dimensions[dim]); + outer_dim_size = output_dims[dim]; + } else { + inner_dim_size *= output_dims[dim]; + ++outer_dim_start; + } + } + + if (inner_dim_size == 0 || outer_dim_size == 0) { + return emptyBlock(); + } + + const Dimensions& input_dims = Dimensions(m_impl.dimensions()); + + // Pre-fill input_block_sizes, broadcast_block_sizes, + // broadcast_block_strides, and broadcast_tensor_strides. Later on we will + // only modify the outer_dim_start-th dimension on these arrays. + + // Calculate the input block size for looking into the input. + Dimensions input_block_sizes; + for (int i = 0; i < outer_dim_start; ++i) { + const int dim = is_col_major ? i : NumDims -i - 1; + input_block_sizes[dim] = input_dims[dim]; + } + for (int i = outer_dim_start; i < NumDims; ++i) { + const int dim = is_col_major ? i : NumDims -i - 1; + input_block_sizes[dim] = 1; + } + Dimensions input_block_strides = + internal::strides(input_block_sizes); + + // Broadcast with the 0-stride trick: Create 1 extra dim for each + // broadcast, set the input stride to 0. + // + // When ColMajor: + // + // - bcast_block_sizes: + // [d_0, b_0, d_1, b_1, ...] + // + // - bcast_block_strides: + // [output_block_strides[0], output_block_strides[0] * d_0, + // output_block_strides[1], output_block_strides[1] * d_1, + // ...] + // + // - bcast_input_strides: + // [input_block_strides[0], 0, + // input_block_strides[1], 0, + // ...]. + // + BroadcastDimensions bcast_block_sizes; + BroadcastDimensions bcast_block_strides; + BroadcastDimensions bcast_input_strides; + + for (int i = 0; i < outer_dim_start; ++i) { + const int dim = is_col_major ? i : NumDims - i - 1; + + const int copy_dim = is_col_major ? 2 * i : 2 * NumDims - 2 * i - 1; + const int broadcast_dim = is_col_major ? copy_dim + 1 : copy_dim - 1; + + bcast_block_sizes[copy_dim] = input_dims[dim]; + bcast_block_sizes[broadcast_dim] = m_broadcast[dim]; + bcast_block_strides[copy_dim] = output_strides[dim]; + bcast_block_strides[broadcast_dim] = + output_strides[dim] * input_dims[dim]; + bcast_input_strides[copy_dim] = input_block_strides[dim]; + bcast_input_strides[broadcast_dim] = 0; + } + for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) { + const int dim = is_col_major ? i : 2 * NumDims - i - 1; + bcast_block_sizes[dim] = 1; + bcast_block_strides[dim] = 0; + bcast_input_strides[dim] = 0; + } + + const int outer_dim = + is_col_major ? outer_dim_start : NumDims - outer_dim_start - 1; + + // Check if we can reuse `desc` destination, or allocate new scratch buffer. 
+ ScalarNoConst* materialized_output = + desc.template destination(); + bool materialized_in_output; + + if (materialized_output != NULL) { + desc.DropDestinationBuffer(); + materialized_in_output = true; + + } else { + materialized_in_output = false; + const size_t materialized_output_size = desc.size() * sizeof(Scalar); + void* output_scratch_mem = scratch.allocate(materialized_output_size); + materialized_output = static_cast(output_scratch_mem); + } + + size_t materialized_input_size = 0; + ScalarNoConst* materialized_input = NULL; + + if (outer_dim_size == 1) { + // We just need one block read using the ready-set values above. + BroadcastBlockV2( + input_block_sizes, input_block_strides, bcast_block_sizes, + bcast_block_strides, bcast_input_strides, 0, desc, scratch, + materialized_output, &materialized_input, &materialized_input_size); + + } else if (input_dims[outer_dim] == 1) { + // Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size. + const int broadcast_outer_dim = + is_col_major ? 2 * outer_dim_start + 1 + : 2 * NumDims - 2 * outer_dim_start - 2; + + bcast_block_sizes[broadcast_outer_dim] = outer_dim_size; + bcast_input_strides[broadcast_outer_dim] = 0; + bcast_block_strides[broadcast_outer_dim] = output_strides[outer_dim]; + + BroadcastBlockV2( + input_block_sizes, input_block_strides, bcast_block_sizes, + bcast_block_strides, bcast_input_strides, 0, desc, scratch, + materialized_output, &materialized_input, &materialized_input_size); + + } else { + // The general case. Let's denote the output block as x[..., + // a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on + // the outer_dim_start-th dimension (< NumDims). We need to split the + // a:a+outer_dim_size into possibly 3 sub-blocks: + // + // (1) a:b, where b is the smallest multiple of + // input_dims[outer_dim_start] in [a, a+outer_dim_size]. + // + // (2) b:c, where c is the largest multiple of input_dims[outer_dim_start] + // in [a, a+outer_dim_size]. + // + // (3) c:a+outer_dim_size . + // + // Or, when b and c do not exist, we just need to process the whole block + // together. + + // Find a. + const Index outer_dim_left_index = + desc.offset() / m_outputStrides[outer_dim]; + + // Find b and c. + const Index input_outer_dim_size = input_dims[outer_dim]; + + // First multiple after a. This is b when <= outer_dim_left_index + + // outer_dim_size. + const Index first_multiple = + divup(outer_dim_left_index, input_outer_dim_size) * + input_outer_dim_size; + + if (first_multiple <= outer_dim_left_index + outer_dim_size) { + // b exists, so does c. Find it. + const Index last_multiple = (outer_dim_left_index + outer_dim_size) / + input_outer_dim_size * input_outer_dim_size; + const int copy_outer_dim = is_col_major + ? 2 * outer_dim_start + : 2 * NumDims - 2 * outer_dim_start - 1; + const int broadcast_outer_dim = + is_col_major ? 
2 * outer_dim_start + 1 + : 2 * NumDims - 2 * outer_dim_start - 2; + + if (first_multiple > outer_dim_left_index) { + const Index head_size = first_multiple - outer_dim_left_index; + input_block_sizes[outer_dim] = head_size; + bcast_block_sizes[copy_outer_dim] = head_size; + bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim]; + bcast_block_strides[copy_outer_dim] = output_strides[outer_dim]; + bcast_block_sizes[broadcast_outer_dim] = 1; + bcast_input_strides[broadcast_outer_dim] = 0; + bcast_block_strides[broadcast_outer_dim] = + output_strides[outer_dim] * input_dims[outer_dim]; + + BroadcastBlockV2(input_block_sizes, input_block_strides, + bcast_block_sizes, bcast_block_strides, + bcast_input_strides, 0, desc, scratch, + materialized_output, &materialized_input, + &materialized_input_size); + } + if (first_multiple < last_multiple) { + input_block_sizes[outer_dim] = input_outer_dim_size; + bcast_block_sizes[copy_outer_dim] = input_outer_dim_size; + bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim]; + bcast_block_strides[copy_outer_dim] = output_strides[outer_dim]; + bcast_block_sizes[broadcast_outer_dim] = + (last_multiple - first_multiple) / input_outer_dim_size; + bcast_input_strides[broadcast_outer_dim] = 0; + bcast_block_strides[broadcast_outer_dim] = + output_strides[outer_dim] * input_dims[outer_dim]; + const Index offset = (first_multiple - outer_dim_left_index) * + m_outputStrides[outer_dim]; + + BroadcastBlockV2(input_block_sizes, input_block_strides, + bcast_block_sizes, bcast_block_strides, + bcast_input_strides, offset, desc, scratch, + materialized_output, &materialized_input, + &materialized_input_size); + } + if (last_multiple < outer_dim_left_index + outer_dim_size) { + const Index tail_size = + outer_dim_left_index + outer_dim_size - last_multiple; + input_block_sizes[outer_dim] = tail_size; + bcast_block_sizes[copy_outer_dim] = tail_size; + bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim]; + bcast_block_strides[copy_outer_dim] = output_strides[outer_dim]; + bcast_block_sizes[broadcast_outer_dim] = 1; + bcast_input_strides[broadcast_outer_dim] = 0; + bcast_block_strides[broadcast_outer_dim] = + output_strides[outer_dim] * input_dims[outer_dim]; + const Index offset = (last_multiple - outer_dim_left_index) * + m_outputStrides[outer_dim]; + + BroadcastBlockV2(input_block_sizes, input_block_strides, + bcast_block_sizes, bcast_block_strides, + bcast_input_strides, offset, desc, scratch, + materialized_output, &materialized_input, + &materialized_input_size); + } + } else { + // b and c do not exist. + const int copy_outer_dim = is_col_major + ? 2 * outer_dim_start + : 2 * NumDims - 2 * outer_dim_start - 1; + input_block_sizes[outer_dim] = outer_dim_size; + bcast_block_sizes[copy_outer_dim] = outer_dim_size; + bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim]; + bcast_block_strides[copy_outer_dim] = output_strides[outer_dim]; + + BroadcastBlockV2( + input_block_sizes, input_block_strides, bcast_block_sizes, + bcast_block_strides, bcast_input_strides, 0, desc, scratch, + materialized_output, &materialized_input, &materialized_input_size); + } + } + + return TensorBlockV2(materialized_in_output + ? internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + materialized_output, + desc.dimensions()); + } + + // This is a special case for `NumDims == 0`, in practice this should not + // happen often, so it's fine to do memory allocation just for a scalar. 
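The head/body/tail split used in the general case above reduces to plain integer arithmetic; a worked example with assumed numbers (a = 5, outer_dim_size = 9, input dimension 4), not taken from the patch.

#include <cstdio>

int main() {
  const long a = 5, outer_dim_size = 9, input_outer_dim_size = 4;

  // b: first multiple of the input size >= a, i.e. divup(a, n) * n.
  const long b = ((a + input_outer_dim_size - 1) / input_outer_dim_size) *
                 input_outer_dim_size;                                   // 8
  // c: last multiple of the input size <= a + outer_dim_size.
  const long c = ((a + outer_dim_size) / input_outer_dim_size) *
                 input_outer_dim_size;                                   // 12
  std::printf("head [%ld,%ld) body [%ld,%ld) tail [%ld,%ld)\n",
              a, b, b, c, c, a + outer_dim_size);
  // Prints: head [5,8) body [8,12) tail [12,14)
  return 0;
}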
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + scalarBlock(TensorBlockScratch& scratch) const { + void* mem = scratch.allocate(sizeof(Scalar)); + ScalarNoConst* buf = static_cast(mem); + *buf = m_impl.coeff(0); + + DSizes dimensions; + for (int i = 0; i < NumDims; ++i) dimensions[i] = 0; + + return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch, buf, + dimensions); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 emptyBlock() const { + DSizes dimensions; + for (int i = 0; i < NumDims; ++i) dimensions[i] = 0; + return TensorBlockV2(internal::TensorBlockKind::kView, NULL, dimensions); + } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } @@ -901,6 +1201,73 @@ struct TensorEvaluator, Device> BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data()); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlockV2( + const Dimensions& input_block_sizes, + const Dimensions& input_block_strides, + const BroadcastDimensions& bcast_block_sizes, + const BroadcastDimensions& bcast_block_strides, + const BroadcastDimensions& bcast_input_strides, Index offset, + const TensorBlockDesc& output_desc, TensorBlockScratch& scratch, + ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, + size_t* materialized_input_size) const { + // ---------------------------------------------------------------------- // + // Tensor block descriptor for reading block from the input. + const Index input_offset = output_desc.offset() + offset; + static const bool is_col_major = static_cast(Layout) == static_cast(ColMajor); + TensorBlockDesc input_desc(is_col_major + ? indexColMajor(input_offset) + : indexRowMajor(input_offset), + input_block_sizes); + + ArgTensorBlock input_block = m_impl.blockV2(input_desc, scratch); + + // ---------------------------------------------------------------------- // + // Materialize input block into a temporary memory buffer only if it's not + // already available in the arg block. + const ScalarNoConst* input_buffer = NULL; + + if (input_block.data() != NULL) { + // Input block already has raw data, there is no need to materialize it. + input_buffer = input_block.data(); + + } else { + // Otherwise we have to do block assignment into a temporary buffer. + + // Maybe reuse previously allocated buffer, or allocate a new one with a + // scratch allocator. + const size_t input_total_size = input_block_sizes.TotalSize(); + if (*materialized_input == NULL || + *materialized_input_size < input_total_size) { + *materialized_input_size = input_total_size; + void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar)); + *materialized_input = static_cast(mem); + } + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index> + TensorBlockAssignment; + + typename TensorBlockAssignment::Dst assignment_dst( + input_block_sizes, input_block_strides, *materialized_input); + + TensorBlockAssignment::Run(assignment_dst, input_block.expr()); + + input_buffer = *materialized_input; + } + + // ---------------------------------------------------------------------- // + // Copy data from materialized input block to the materialized output, using + // given broadcast strides (strides with zeroes). 
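A hedged usage sketch of when the materialization branch in BroadcastBlockV2 is taken: if the broadcast argument is itself an expression (a sum here), its block exposes no raw data() pointer and is first assigned into a scratch buffer. Whether the block path runs at all depends on the executor selected at evaluation time; the shapes below are arbitrary.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(4, 1), b(4, 1);
  a.setRandom();
  b.setRandom();

  // The argument of broadcast() is a cwise expression, not a plain tensor,
  // so its block has no directly addressable buffer.
  Eigen::array<Eigen::Index, 2> bcast = {{1, 8}};
  Eigen::Tensor<float, 2> out = (a + b).broadcast(bcast);
  return out.dimension(1) == 8 ? 0 : 1;
}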
+ typedef internal::TensorBlockIOV2 + TensorBlockIOV2; + + typename TensorBlockIOV2::Src src(bcast_input_strides, input_buffer); + typename TensorBlockIOV2::Dst dst(bcast_block_sizes, bcast_block_strides, + materialized_output + offset); + + TensorBlockIOV2::Copy(dst, src); + } + protected: const Device EIGEN_DEVICE_REF m_device; const typename internal::remove_reference::type m_broadcast; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index b630e6867..8860840a7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -149,6 +149,7 @@ struct TensorEvaluator, Device> Layout = TensorEvaluator::Layout, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = TensorEvaluator::BlockAccess, + BlockAccessV2 = false, // Chipping of outer-most dimension is a trivial operation, because we can // read and write directly from the underlying tensor using single offset. IsOuterChipping = (static_cast(Layout) == ColMajor && DimId == NumInputDims - 1) || @@ -169,6 +170,10 @@ struct TensorEvaluator, Device> typedef internal::TensorBlock OutputTensorBlock; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 292a1bae1..c24e74ec6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -125,11 +125,16 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) { @@ -287,7 +292,7 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) : Base(op, device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 2f8656fbb..d61209133 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -376,12 +376,17 @@ struct TensorContractionEvaluatorBase IsAligned = true, PacketAccess = (PacketType::size > 1), BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = 
TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = true }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + // Most of the code is assuming that both input tensors are ColMajor. If the // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: // If we want to compute A * B = C, where A is LHS and B is RHS, the code diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index fa329bfe6..a8160e17e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -303,11 +303,16 @@ struct TensorEvaluator, Device> internal::type_casting_traits::VectorizedCast, #endif BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 25e1e5896..8220038c1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -310,12 +310,17 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) { @@ -783,12 +788,17 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index e79958fc9..b660242f4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h 
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -243,12 +243,17 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device) : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 723d2b082..f1f46161e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -96,12 +96,17 @@ struct TensorEvaluator, Devi IsAligned = false, PacketAccess = (PacketType::size > 1), BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) : m_op(op), m_device(device), m_result(NULL) { @@ -265,12 +270,17 @@ struct TensorEvaluator::size > 1), BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_op(op), m_device(device), m_result(NULL) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index dbf5af094..d7bebd30b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -383,8 +383,17 @@ struct DSizes : array { } }; - - +template +std::ostream& operator<<(std::ostream& os, + const DSizes& dims) { + os << "["; + for (int i = 0; i < NumDims; ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} // Boilerplate namespace internal { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 60a07d6eb..bf7522682 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -111,6 +111,7 @@ struct TensorEvaluator, Device> IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = true, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -124,6 +125,10 @@ struct TensorEvaluator, Device> 
CoeffReturnType, Index, internal::traits::NumDimensions, Layout> TensorBlockReader; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){} diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index fec735868..c87075a72 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -43,13 +43,14 @@ struct TensorEvaluator internal::traits::NumDimensions : 0; enum { - IsAligned = Derived::IsAligned, - PacketAccess = (PacketType::size > 1), - BlockAccess = internal::is_arithmetic::type>::value, - PreferBlockAccess = false, - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true + IsAligned = Derived::IsAligned, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::type>::value, + BlockAccessV2 = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true }; typedef typename internal::TensorBlock< @@ -62,9 +63,13 @@ struct TensorEvaluator typename internal::remove_const::type, Index, NumCoords, Layout> TensorBlockWriter; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(device.get((const_cast(m.data())))), - m_dims(m.dimensions()), + : m_data(device.get((const_cast(m.data())))), + m_dims(m.dimensions()), m_device(device) { } @@ -162,6 +167,22 @@ struct TensorEvaluator TensorBlockWriter::Run(block, m_data); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2( + const TensorBlockDesc& desc, const TensorBlockV2& block) { + assert(m_data != NULL); + + typedef typename TensorBlockV2::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment + TensorBlockAssign; + typename TensorBlockAssign::Dst dst(desc.dimensions(), + internal::strides(m_dims), + m_data, desc.offset()); + + TensorBlockAssign::Run(dst, block.expr()); + } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } #ifdef EIGEN_USE_SYCL @@ -220,28 +241,43 @@ struct TensorEvaluator typedef StorageMemory Storage; typedef typename Storage::Type EvaluatorPointerType; + typedef typename internal::remove_const::type ScalarNoConst; + // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? 
internal::traits::NumDimensions : 0; static const int PacketSize = PacketType::size; enum { - IsAligned = Derived::IsAligned, - PacketAccess = (PacketType::size > 1), - BlockAccess = internal::is_arithmetic::type>::value, + IsAligned = Derived::IsAligned, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::value, + BlockAccessV2 = internal::is_arithmetic::value, PreferBlockAccess = false, - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true }; - typedef typename internal::TensorBlock< - typename internal::remove_const::type, Index, NumCoords, Layout> + typedef typename internal::TensorBlock TensorBlock; - typedef typename internal::TensorBlockReader< - typename internal::remove_const::type, Index, NumCoords, Layout> + typedef typename internal::TensorBlockReader TensorBlockReader; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef internal::TensorBlockIOV2 + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + typedef typename internal::TensorMaterializedBlock + TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) { } @@ -310,6 +346,67 @@ struct TensorEvaluator TensorBlockReader::Run(block, m_data); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + assert(m_data != NULL); + + // TODO(ezhulenev): Move it to TensorBlockV2 and reuse in TensorForcedEval. + + // If a tensor block descriptor covers a contiguous block of the underlying + // memory, we can skip block buffer memory allocation, and construct a block + // from existing `m_data` memory buffer. + // + // Example: (RowMajor layout) + // m_dims: [11, 12, 13, 14] + // desc.dimensions(): [1, 1, 3, 14] + // + // In this case we can construct a TensorBlock starting at + // `m_data + desc.offset()`, with a `desc.dimensions()` block sizes. + + static const bool + is_col_major = static_cast(Layout) == static_cast(ColMajor); + + // Find out how many inner dimensions have a matching size. + int num_matching_inner_dims = 0; + for (int i = 0; i < NumCoords; ++i) { + int dim = is_col_major ? i : NumCoords - i - 1; + if (m_dims[dim] != desc.dimensions()[dim]) break; + ++num_matching_inner_dims; + } + + // All the outer dimensions must be of size `1`, except a single dimension + // before the matching inner dimension (`3` in the example above). + bool can_use_direct_access = true; + for (int i = num_matching_inner_dims + 1; i < NumCoords; ++i) { + int dim = is_col_major ? 
i : NumCoords - i - 1; + if (desc.dimension(dim) != 1) { + can_use_direct_access = false; + break; + } + } + + if (can_use_direct_access) { + EvaluatorPointerType block_start = m_data + desc.offset(); + return TensorBlockV2(internal::TensorBlockKind::kView, block_start, + desc.dimensions()); + + } else { + void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); + ScalarNoConst* block_buffer = static_cast(mem); + + TensorBlockIOSrc src(internal::strides(m_dims), m_data, + desc.offset()); + TensorBlockIODst dst(desc.dimensions(), + internal::strides(desc.dimensions()), + block_buffer); + + TensorBlockIO::Copy(dst, src); + + return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch, + block_buffer, desc.dimensions()); + } + } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } #ifdef EIGEN_USE_SYCL // binding placeholder accessors to a command group handler for SYCL @@ -355,12 +452,17 @@ struct TensorEvaluator, Device> #endif , BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } @@ -421,6 +523,7 @@ struct TensorEvaluator, Device> PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, BlockAccess = TensorEvaluator::BlockAccess, + BlockAccessV2 = TensorEvaluator::BlockAccessV2, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -446,6 +549,17 @@ struct TensorEvaluator, Device> typedef internal::TensorBlock TensorBlock; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlockV2 + ArgTensorBlock; + + typedef internal::TensorCwiseUnaryBlock + TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { @@ -505,6 +619,11 @@ struct TensorEvaluator, Device> arg_block.data()); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 + blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const { + return TensorBlockV2(m_argImpl.blockV2(desc, scratch), m_functor); + } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } #ifdef EIGEN_USE_SYCL @@ -537,6 +656,8 @@ struct TensorEvaluator::PacketAccess, BlockAccess = TensorEvaluator::BlockAccess & TensorEvaluator::BlockAccess, + BlockAccessV2 = TensorEvaluator::BlockAccessV2 & + TensorEvaluator::BlockAccessV2, PreferBlockAccess = TensorEvaluator::PreferBlockAccess | TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -571,6 +692,20 @@ struct TensorEvaluator::Layout> TensorBlock; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef 
internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlockV2 + LeftTensorBlock; + typedef typename TensorEvaluator::TensorBlockV2 + RightTensorBlock; + + typedef internal::TensorCwiseBinaryBlock + TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { // TODO: use right impl instead if right impl dimensions are known at compile time. @@ -642,6 +777,13 @@ struct TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -709,6 +852,10 @@ struct TensorEvaluator Storage; typedef typename Storage::Type EvaluatorPointerType; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { // TODO: use arg2 or arg3 dimensions if they are known at compile time. @@ -780,6 +927,7 @@ struct TensorEvaluator PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & PacketType::HasBlend, BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -805,6 +953,10 @@ struct TensorEvaluator typedef StorageMemory Storage; typedef typename Storage::Type EvaluatorPointerType; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { // TODO: use then or else impl instead if they happen to be known at compile time. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index cf07656b3..a7cb8dc97 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -23,7 +23,7 @@ namespace Eigen { * * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and * instructions) - * @tparam Tileable can use block based tensor evaluation + * @tparam Tiling can use block based tensor evaluation * (see TensorBlock.h) */ namespace internal { @@ -76,8 +76,13 @@ struct ExpressionHasTensorBroadcastingOp< * Default strategy: the expression is evaluated sequentially with a single cpu * thread, without vectorization and block evaluation. */ +#if EIGEN_HAS_CXX11 template + TiledEvaluation Tiling> +#else + template +#endif class TensorExecutor { public: typedef typename Expression::Index StorageIndex; @@ -109,8 +114,8 @@ class TensorAsyncExecutor {}; * Process all the data with a single cpu thread, using vectorized instructions. */ template -class TensorExecutor { +class TensorExecutor { public: typedef typename Expression::Index StorageIndex; @@ -152,7 +157,7 @@ class TensorExecutor class TensorExecutor { + /*Tiling=*/TiledEvaluation::Legacy> { public: typedef typename traits::Scalar Scalar; typedef typename remove_const::type ScalarNoConst; @@ -176,8 +181,7 @@ class TensorExecutor::value) { // TODO(andydavis) Reduce block management overhead for small tensors. 
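For reference, a hedged sketch of how an expression reaches these executors through the public API: assigning via device() instantiates TensorExecutor, with the Vectorizable and Tiling template arguments deduced from the IsVectorizable/IsTileable traits; which specialization runs depends on the build configuration, so this only shows the entry point.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::DefaultDevice device;
  Eigen::Tensor<float, 3> in(16, 16, 16), out(16, 16, 16);
  in.setRandom();

  // device(...) = expr; is the call that picks a TensorExecutor specialization.
  out.device(device) = in * in.constant(2.0f) + in;
  return 0;
}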
- internal::TensorExecutor::run(expr, device); + internal::TensorExecutor::run(expr,device); evaluator.cleanup(); return; } @@ -211,6 +215,70 @@ class TensorExecutor +class TensorExecutor { + public: + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; + + static const int NumDims = traits::NumDimensions; + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + typedef TensorBlock TensorBlock; + typedef TensorBlockMapper TensorBlockMapper; + typedef typename TensorBlock::Dimensions TensorBlockDimensions; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator + TensorBlockScratch; + + Evaluator evaluator(expr, device); + Index total_size = array_prod(evaluator.dimensions()); + Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar); + + // TODO(ezhulenev): Do not use tiling for small tensors? + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + + if (needs_assign) { + // Size tensor blocks to fit in cache (or requested target block size). + Index block_total_size = numext::mini(cache_size, total_size); + TensorBlockShapeType block_shape = kSkewedInnerDims; + // Query expression tree for desired block size/shape. + std::vector resources; + evaluator.getResourceRequirements(&resources); + MergeResourceRequirements(resources, &block_shape, &block_total_size); + + TensorBlockMapper block_mapper( + TensorBlockDimensions(evaluator.dimensions()), block_shape, + block_total_size); + block_total_size = block_mapper.block_dims_total_size(); + + // Share scratch memory allocator between all blocks. + TensorBlockScratch scratch(device); + + const StorageIndex total_block_count = block_mapper.total_block_count(); + for (StorageIndex i = 0; i < total_block_count; ++i) { + TensorBlock block = block_mapper.GetBlockForIndex(i, NULL); + + TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes()); + evaluator.evalBlockV2(desc, scratch); + scratch.reset(); + } + } + evaluator.cleanup(); + } +}; + /** * Multicore strategy: the index space is partitioned and each partition is * executed on a single core. @@ -256,10 +324,11 @@ struct TensorExecutorTilingContext { }; // Computes a block evaluation parameters, and allocates temporary memory buffer -// for blocks. See TensorExecutor/TensorAsyncExecutor (Tileable=true) below. +// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below. template TensorExecutorTilingContext GetTensorExecutorTilingContext( - const ThreadPoolDevice& device, const Evaluator& evaluator) { + const ThreadPoolDevice& device, const Evaluator& evaluator, + bool allocate_buffer = true) { // Prefer blocks skewed toward inner dimension. TensorBlockShapeType block_shape = kSkewedInnerDims; Index block_total_size = 0; @@ -284,7 +353,13 @@ TensorExecutorTilingContext GetTensorExecutorTilingContext( const size_t aligned_blocksize = align * divup(block_size * sizeof(typename Evaluator::Scalar), align); - void* buf = device.allocate((num_threads + 1) * aligned_blocksize); + + // TODO(ezhulenev): In new block evaluation framework there is no need for + // allocating temporary buffers, remove this after migration. 
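A hedged usage sketch for the multi-threaded tiled path added below: a broadcasting expression evaluated on a ThreadPoolDevice, where each worker thread builds its own TensorBlockScratch inside eval_block; thread count and tensor sizes are arbitrary.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 2> in(256, 1), out(256, 256);
  in.setRandom();

  // Broadcasting prefers block evaluation, so the tiled executor is used.
  Eigen::array<Eigen::Index, 2> bcast = {{1, 256}};
  out.device(device) = in.broadcast(bcast);
  return 0;
}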
+ void* buf = NULL; + if (allocate_buffer) { + buf = device.allocate((num_threads + 1) * aligned_blocksize); + } return {block_mapper, cost * block_size, buf, aligned_blocksize}; } @@ -344,8 +419,8 @@ struct EvalRange { } }; -template -class TensorExecutor { +template +class TensorExecutor { public: typedef typename Expression::Index StorageIndex; @@ -369,7 +444,8 @@ class TensorExecutor { }; template -class TensorExecutor { +class TensorExecutor { public: typedef typename traits::Index StorageIndex; typedef typename traits::Scalar Scalar; @@ -387,11 +463,12 @@ class TensorExecutor::value) { + if (total_size < cache_size && + !ExpressionHasTensorBroadcastingOp::value) { // TODO(andydavis) Reduce block management overhead for small tensors. internal::TensorExecutor::run(expr, device); + /*Tiling=*/TiledEvaluation::Off>::run(expr, + device); evaluator.cleanup(); return; } @@ -419,6 +496,57 @@ class TensorExecutor +class TensorExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + static const int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper + BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor + TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator + TensorBlockScratch; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const ThreadPoolDevice& device) { + Evaluator evaluator(expr, device); + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const TilingContext tiling = + internal::GetTensorExecutorTilingContext( + device, evaluator, /*allocate_buffer=*/false); + + auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, + IndexType lastBlockIdx) { + TensorBlockScratch scratch(device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) { + auto block = tiling.block_mapper.GetBlockForIndex(block_idx, nullptr); + TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes()); + evaluator.evalBlockV2(desc, scratch); + scratch.reset(); + } + }; + + device.parallelFor(tiling.block_mapper.total_block_count(), tiling.cost, + eval_block); + } + evaluator.cleanup(); + } +}; + template class TensorAsyncExecutor -class TensorExecutor { +template +class TensorExecutor { public: typedef typename Expression::Index StorageIndex; static void run(const Expression& expr, const GpuDevice& device); @@ -612,8 +740,8 @@ EigenMetaKernel(Evaluator eval, StorageIndex size) { } /*static*/ -template -EIGEN_STRONG_INLINE void TensorExecutor::run( +template +EIGEN_STRONG_INLINE void TensorExecutor::run( const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); @@ -711,8 +839,8 @@ struct ExecExprFunctorKernel range_, vectorizable_threads_, evaluator) {} }; -template -class TensorExecutor { +template +class TensorExecutor { public: typedef typename Expression::Index Index; static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 8d1a6d9cc..7be007d94 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -138,12 +138,17 @@ struct TensorEvaluator, D IsAligned = false, 
PacketAccess = true, BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index 71ba56773..5f06c97ab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -42,12 +42,17 @@ class TensorFixedSize : public TensorBase0), PacketAccess = (internal::packet_traits::size > 1), BlockAccess = false, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, RawAccess = true }; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + typedef Dimensions_ Dimensions; static const std::size_t NumIndices = Dimensions::count; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index efd6a7557..8d45bd62a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -97,6 +97,7 @@ struct TensorEvaluator, Device> IsAligned = true, PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::value, + BlockAccessV2 = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = true @@ -109,8 +110,12 @@ struct TensorEvaluator, Device> CoeffReturnType, Index, internal::traits::NumDimensions, Layout> TensorBlockReader; + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlockV2; + //===--------------------------------------------------------------------===// + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_op(op.expression()), + : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } @@ -132,13 +137,13 @@ struct TensorEvaluator, Device> #endif typedef TensorEvalToOp< const typename internal::remove_const::type > EvalTo; EvalTo evalToTmp(m_device.get(m_buffer), m_op); - const bool Vectorize = internal::IsVectorizable::value; - const bool Tile = TensorEvaluator::BlockAccess && - TensorEvaluator::PreferBlockAccess; - internal::TensorExecutor::type, - Vectorize, Tile>::run(evalToTmp, m_device); + internal::TensorExecutor< + const EvalTo, typename internal::remove_const::type, + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>:: + run(evalToTmp, m_device); + return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 772dbbe35..f33489a33 100644 
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -154,23 +154,61 @@ struct IsVectorizable {
     TensorEvaluator::IsAligned;
 };

+// Tiled evaluation strategy.
+#if !EIGEN_HAS_CXX11
+// To be able to use `TiledEvaluation::Off` in C++03 we need a namespace.
+// (Use of enumeration in a nested name specifier is a c++11 extension).
+namespace TiledEvaluation {
+#endif
+enum TiledEvaluation {
+  Off = 0,     // tiled evaluation is not supported
+  On = 1,      // still work in progress (see TensorBlockV2.h)
+  Legacy = 2   // soon to be deprecated (see TensorBock.h)
+};
+#if !EIGEN_HAS_CXX11
+}  // namespace TiledEvaluation
+#endif
+
 template  struct IsTileable {
+#if !EIGEN_HAS_CXX11
+  typedef TiledEvaluation::TiledEvaluation TiledEvaluation;
+#endif
+
   // Check that block evaluation is supported and it's a preferred option (at
   // least one sub-expression has much faster block evaluation, e.g.
   // broadcasting).
-  static const bool value = TensorEvaluator::BlockAccess &&
-                            TensorEvaluator::PreferBlockAccess;
+  static const bool BlockAccess =
+      TensorEvaluator::BlockAccess &&
+      TensorEvaluator::PreferBlockAccess;
+
+  static const bool BlockAccessV2 =
+      TensorEvaluator::BlockAccessV2 &&
+      TensorEvaluator::PreferBlockAccess;
+
+
+  static const TiledEvaluation value =
+      BlockAccessV2
+          ? TiledEvaluation::On
+          : (BlockAccess ? TiledEvaluation::Legacy : TiledEvaluation::Off);
 };

+#if EIGEN_HAS_CXX11
+template ::value,
+          TiledEvaluation Tiling = IsTileable::value>
+class TensorExecutor;
+#else
 template ::value,
-          bool Tileable = IsTileable::value>
+          TiledEvaluation::TiledEvaluation Tiling = IsTileable::value>
 class TensorExecutor;
+#endif

+// TODO(ezhulenev): Add TiledEvaluation support to async executor.
 template ::value,
-          bool Tileable = IsTileable::value>
+          bool Tileable = IsTileable::BlockAccess>
 class TensorAsyncExecutor;

 }  // end namespace internal

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index b7ad33626..639e1dbb0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -94,6 +94,7 @@ struct TensorEvaluator, Device>
     IsAligned = false,
     PacketAccess = (PacketType::size > 1),
     BlockAccess = true,
+    BlockAccessV2 = false,
     PreferBlockAccess = true,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,  // to be implemented
@@ -105,6 +106,10 @@ struct TensorEvaluator, Device>
   typedef internal::TensorBlock TensorBlock;

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_device(device), m_generator(op.generator()) {

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 5ff67bdae..38bf80c5d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -232,6 +232,7 @@ struct TensorEvaluator, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = true,
+    BlockAccessV2 = false,
     PreferBlockAccess = true,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,
@@ -241,6 +242,10 @@ struct TensorEvaluator, Device>
   typedef internal::TensorBlock OutputTensorBlock;

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(
       const XprType& op, const Device& device)
       : m_device(device), m_impl(op.expression(), device) {

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index f8cda6574..e1df84a1d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -93,12 +93,17 @@ struct TensorEvaluator, Device>
     IsAligned = /*TensorEvaluator::IsAligned*/ false,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = false,
+    BlockAccessV2 = false,
     PreferBlockAccess = false,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_strides(op.strides()) {

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index 755170a34..f84edc6b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -120,12 +120,17 @@ struct TensorEvaluator, Device>
     IsAligned = TensorEvaluator::IsAligned,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = false,
+    BlockAccessV2 = false,
     PreferBlockAccess = false,
     Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor,
     CoordAccess = false,  // to be implemented
     RawAccess = TensorEvaluator::RawAccess
   };

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device) {
@@ -195,11 +200,16 @@ template
     IsAligned = TensorEvaluator::IsAligned,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = false,
+    BlockAccessV2 = false,
     PreferBlockAccess = false,
     Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor,
     CoordAccess = false  // to be implemented
   };

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
   { }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 9ab1415ac..c8333e488 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -121,6 +121,7 @@ struct TensorEvaluator, Device>
     BlockAccess = TensorEvaluator::BlockAccess &&
                   TensorEvaluator::RawAccess &&
                   NumInputDims > 0 && NumOutputDims > 0,
+    BlockAccessV2 = false,
     PreferBlockAccess = true,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,  // to be implemented
@@ -137,6 +138,10 @@ struct TensorEvaluator, Device>
                                      Layout> OutputTensorBlockReader;

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_dimensions(op.dimensions()) {
@@ -363,6 +368,7 @@ template
     IsAligned = TensorEvaluator::IsAligned,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = false,
+    BlockAccessV2 = false,
     PreferBlockAccess = false,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,  // to be implemented
@@ -378,6 +384,10 @@ template
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType::type PacketReturnType;

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
     return this->m_impl.coeffRef(index);
@@ -532,6 +542,7 @@ struct TensorEvaluator, Devi
     IsAligned = false,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = TensorEvaluator::BlockAccess,
+    BlockAccessV2 = false,
     PreferBlockAccess = true,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,
@@ -543,6 +554,10 @@ struct TensorEvaluator, Devi
   typedef internal::TensorBlock TensorBlock;
   typedef typename TensorBlock::Dimensions TensorBlockDimensions;

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_device(device),
         m_dimensions(op.sizes()), m_offsets(op.startIndices()) {
@@ -813,6 +828,7 @@ struct TensorEvaluator, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = TensorEvaluator::BlockAccess,
+    BlockAccessV2 = false,
     PreferBlockAccess = true,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,
@@ -824,6 +840,10 @@ struct TensorEvaluator, Device>
   typedef internal::TensorBlock TensorBlock;
   typedef typename TensorBlock::Dimensions TensorBlockDimensions;

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }
@@ -1002,11 +1022,16 @@ struct TensorEvaluator::Layout,
     RawAccess = false
   };

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_device(device),
@@ -1179,12 +1204,17 @@ struct TensorEvaluator::Layout,
     CoordAccess = TensorEvaluator::CoordAccess,
     RawAccess = false
   };

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index e98382cc1..7b9ad7374 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -99,12 +99,17 @@ struct TensorEvaluator, Device
     IsAligned = true,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = false,
+    BlockAccessV2 = false,
     PreferBlockAccess = false,
     Layout = TensorEvaluator::Layout,
     CoordAccess = true,
     RawAccess = false
   };

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()) {

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 47db839db..8158aa574 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -97,12 +97,17 @@ struct TensorEvaluator, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = false,
+    BlockAccessV2 = false,
     PreferBlockAccess = false,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,
     RawAccess = false
   };

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device) {

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 5dddfcf85..cee7ae657 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -585,6 +585,7 @@ struct TensorReductionEvaluatorBase::Layout,
     CoordAccess = false,  // to be implemented
@@ -598,6 +599,10 @@ struct TensorReductionEvaluatorBase InputTensorBlock;

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value;
   static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value;
   static const bool RunningFullReduction = (NumOutputDims==0);

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index b92c9ffaf..87072006d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -142,12 +142,17 @@ template class TensorRef : public TensorBase, Device>
     IsAligned = false,
     PacketAccess = false,
     BlockAccess = false,
+    BlockAccessV2 = false,
     PreferBlockAccess = false,
     Layout = TensorRef::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&)
       : m_ref(m)
   { }
@@ -401,7 +411,7 @@ struct TensorEvaluator, Device>
   }

   EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); }
-
+
  protected:
   TensorRef m_ref;
 };
@@ -423,10 +433,15 @@ struct TensorEvaluator, Device> : public TensorEvaluator& m, const Device& d) : Base(m, d) { }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 123675196..855d04eb7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -116,6 +116,7 @@ struct TensorEvaluator, Device
     IsAligned = false,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = true,
+    BlockAccessV2 = false,
     PreferBlockAccess = true,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,  // to be implemented
@@ -128,6 +129,10 @@ struct TensorEvaluator, Device
   typedef internal::TensorBlock OutputTensorBlock;

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device),
@@ -400,6 +405,7 @@ struct TensorEvaluator, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = false,
+    BlockAccessV2 = false,
     PreferBlockAccess = false,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,  // to be implemented
@@ -413,6 +419,10 @@ struct TensorEvaluator, Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType::type PacketReturnType;
   static const int PacketSize = PacketType::size;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return this->m_dimensions; }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 44156126d..1e6fc93b1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -100,12 +100,17 @@ struct TensorEvaluator, Device> {
     IsAligned = false,
     PacketAccess = (PacketType::size > 1),
     BlockAccess = false,
+    BlockAccessV2 = false,
     PreferBlockAccess = false,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,
     RawAccess = true
   };

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device),

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index ad6332179..5e8abad75 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -116,6 +116,7 @@ struct TensorEvaluator, Device>
     IsAligned = false,
     PacketAccess = (PacketType::size > 1),
     BlockAccess = TensorEvaluator::BlockAccess,
+    BlockAccessV2 = false,
     PreferBlockAccess = true,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,  // to be implemented
@@ -129,6 +130,10 @@ struct TensorEvaluator, Device>
   typedef internal::TensorBlockReader TensorBlockReader;

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_device(device),
@@ -426,6 +431,7 @@ struct TensorEvaluator, Device>
     IsAligned = false,
     PacketAccess = (PacketType::size > 1),
     BlockAccess = TensorEvaluator::BlockAccess,
+    BlockAccessV2 = false,
     PreferBlockAccess = true,
     Layout = TensorEvaluator::Layout,
     RawAccess = false
@@ -438,6 +444,10 @@ struct TensorEvaluator, Device>
   typedef internal::TensorBlockWriter TensorBlockWriter;

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : Base(op, device)
   { }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 3b1cbaabc..8c05704c2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -115,12 +115,17 @@ struct TensorEvaluator, Device>
     IsAligned = /*TensorEvaluator::IsAligned*/false,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = false,
+    BlockAccessV2 = false,
     PreferBlockAccess = false,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device) {

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
index d04b1bea7..9dc7723cb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -98,12 +98,17 @@ struct TensorEvaluator, Device>
     IsAligned = false,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = false,
+    BlockAccessV2 = false,
     PreferBlockAccess = false,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,
     RawAccess = false
   };

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_traceDim(1), m_device(device) {

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 29a2d5538..292393e9a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -184,12 +184,17 @@ struct TensorEvaluator, D
     IsAligned = false,
     PacketAccess = TensorEvaluator::PacketAccess,
     BlockAccess = false,
+    BlockAccessV2 = false,
     PreferBlockAccess = false,
     Layout = TensorEvaluator::Layout,
     CoordAccess = false,
     RawAccess = false
   };

+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlockV2;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device) {
--
cgit v1.2.3
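
The TensorForwardDeclarations.h change above boils down to mapping two evaluator flags (legacy BlockAccess and the new BlockAccessV2, gated by PreferBlockAccess) onto the three-valued TiledEvaluation strategy that selects the TensorExecutor specialization. The following is a minimal standalone sketch of that selection logic only; it is not Eigen code, the names EvaluatorTraits and SelectTiling are hypothetical, and the real trait computes the same decision at compile time from TensorEvaluator typedefs rather than from a runtime struct.

    #include <iostream>

    // Hypothetical stand-ins for the per-evaluator flags touched by this patch.
    struct EvaluatorTraits {
      bool BlockAccess;        // legacy block API supported
      bool BlockAccessV2;      // TensorBlockV2 API supported
      bool PreferBlockAccess;  // at least one sub-expression prefers blocks
    };

    // Mirrors the enum added to TensorForwardDeclarations.h.
    enum class TiledEvaluation { Off = 0, On = 1, Legacy = 2 };

    // IsTileable-style selection: V2 block access wins, legacy block access is
    // the fallback, otherwise tiling is disabled.
    constexpr TiledEvaluation SelectTiling(EvaluatorTraits t) {
      return (t.BlockAccessV2 && t.PreferBlockAccess)
                 ? TiledEvaluation::On
                 : ((t.BlockAccess && t.PreferBlockAccess) ? TiledEvaluation::Legacy
                                                           : TiledEvaluation::Off);
    }

    int main() {
      // A broadcasting-like expression: prefers blocks, only the legacy API available.
      constexpr EvaluatorTraits legacy_only{true, false, true};
      // An expression whose evaluators already implement the TensorBlockV2 path.
      constexpr EvaluatorTraits v2_ready{true, true, true};

      static_assert(SelectTiling(legacy_only) == TiledEvaluation::Legacy, "legacy fallback");
      static_assert(SelectTiling(v2_ready) == TiledEvaluation::On, "V2 preferred");

      std::cout << "tiling(legacy_only) = " << static_cast<int>(SelectTiling(legacy_only)) << "\n";
      std::cout << "tiling(v2_ready)    = " << static_cast<int>(SelectTiling(v2_ready)) << "\n";
    }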