path: root/unsupported/Eigen/CXX11/src/Tensor
author     Eugene Zhulenev <ezhulenev@google.com>  2019-09-24 12:52:45 -0700
committer  Eugene Zhulenev <ezhulenev@google.com>  2019-09-24 12:52:45 -0700
commit     ef9dfee7bdc8e0d82c9b7ddf9414ef99d866d7ba (patch)
tree       490a8ae1f247cf226475f504ea1d3ab305b98097 /unsupported/Eigen/CXX11/src/Tensor
parent     efd9867ff0e8df23016ac6c9828d0d7bf8bec1b1 (diff)
Tensor block evaluation V2 support for unary/binary/broadcasting
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h              |  10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h          |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h              |  37
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h               |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h        | 367
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h            |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h       |  12
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h         |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h          |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h         |  10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h     |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h            |  10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h          |  13
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h              |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h           | 190
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h            | 172
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h                 |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h           |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h          |  19
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h |  46
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h           |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h          |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h           |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h          |  10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h            |  30
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h             |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h               |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h           |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorRef.h                 |  17
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h             |  10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScan.h                |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h           |  10
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h            |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h               |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h         |   5
35 files changed, 1002 insertions, 56 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index 05e7963f0..5cb5b7a2e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -89,12 +89,17 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device) { }
@@ -226,12 +231,17 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_orig_impl(op.expression(), device),
m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
index 5110e99ee..e6d8e7f91 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMaxSycl.h
@@ -109,12 +109,17 @@ struct TensorEvaluator<const TensorTupleReducerDeviceOp<StrideDims, ArgType>, Sy
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, SyclKernelDevice>::Layout,
CoordAccess = false,
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const SyclKernelDevice& device)
: m_impl(op.expression(), device), m_return_dim(op.return_dim()), m_strides(op.strides()), m_stride_mod(op.stride_mod()),
m_stride_div(op.stride_div()){}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 270ad974e..29aa7a97e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -110,6 +110,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
TensorEvaluator<RightArgType, Device>::BlockAccess,
+ BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
+ TensorEvaluator<RightArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
@@ -120,6 +122,18 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
typename internal::remove_const<Scalar>::type, Index, NumDims, Layout>
TensorBlock;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlockV2
+ RightTensorBlock;
+
+ typedef internal::TensorBlockAssignment<
+ Scalar, NumDims, typename RightTensorBlock::XprType, Index>
+ TensorBlockAssignment;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
@@ -214,6 +228,29 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
m_leftImpl.writeBlock(*block);
}
}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlockV2(
+ TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+ if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
+ m_leftImpl.data() != NULL) {
+ // If destination has raw data access, we pass it as a potential
+ // destination for a block descriptor evaluation.
+ desc.AddDestinationBuffer(
+ /*dst_base=*/m_leftImpl.data() + desc.offset(),
+ /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()),
+ /*total_dst_bytes=*/
+ (internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar)));
+ }
+
+ RightTensorBlock block = m_rightImpl.blockV2(desc, scratch);
+ // If block was evaluated into a destination, there is no need to do
+ // assignment.
+ if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
+ m_leftImpl.writeBlockV2(desc, block);
+ }
+ block.cleanup();
+ }
+
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
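
The evalBlockV2 hunk above follows a simple control flow: offer the assignment's raw output buffer as a destination, let the RHS evaluate its block, and skip the explicit writeBlockV2 only when the RHS materialized directly into that destination. A minimal standalone sketch of that pattern (made-up types, not Eigen's actual block API):

// Standalone illustration of the evalBlockV2 control flow (made-up
// types, not Eigen's): the RHS writes into the offered destination
// when possible, and the explicit copy-back is skipped in that case.
#include <cstdio>
#include <vector>

enum class BlockKind { kMaterializedInScratch, kMaterializedInOutput };

struct BlockDesc {
  float* destination;  // may be null: no destination offered
  size_t size;
};

struct Block {
  BlockKind kind;
  const float* data;
};

// Hypothetical RHS evaluator: fills the destination if one was offered,
// otherwise materializes into scratch memory.
Block evalRhsBlock(BlockDesc& desc, std::vector<float>& scratch) {
  float* out = desc.destination;
  BlockKind kind = BlockKind::kMaterializedInOutput;
  if (out == nullptr) {
    scratch.assign(desc.size, 0.0f);
    out = scratch.data();
    kind = BlockKind::kMaterializedInScratch;
  }
  for (size_t i = 0; i < desc.size; ++i) out[i] = 2.0f * float(i);
  return {kind, out};
}

int main() {
  std::vector<float> output(8), scratch;
  BlockDesc desc{output.data(), output.size()};  // offer the raw output buffer

  Block block = evalRhsBlock(desc, scratch);
  if (block.kind != BlockKind::kMaterializedInOutput) {
    // Only needed when the RHS did not evaluate into the output directly.
    for (size_t i = 0; i < desc.size; ++i) output[i] = block.data[i];
  }
  std::printf("output[3] = %g\n", output[3]);  // prints 6
}
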
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index c8a8b16db..a8e7a8d7b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -1025,6 +1025,11 @@ class TensorBlockMapper {
return m_block_dim_sizes.TotalSize();
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions&
+ block_dim_sizes() const {
+ return m_block_dim_sizes;
+ }
+
private:
static Dimensions BlockDimensions(const Dimensions& tensor_dims,
const TensorBlockShapeType block_shape,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index b290de311..9e4fae99a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -115,6 +115,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
@@ -131,11 +132,24 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
// We do block based broadcasting using a trick with 2x tensor rank and 0
// strides. See block method implementation for details.
typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
+
typedef internal::TensorBlock<ScalarNoConst, Index, 2 * NumDims, Layout>
BroadcastTensorBlock;
typedef internal::TensorBlockReader<ScalarNoConst, Index, 2 * NumDims, Layout>
BroadcastTensorBlockReader;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+ ArgTensorBlock;
+
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+ Layout, Index>
+ TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: isCopy(false), nByOne(false), oneByN(false),
@@ -867,6 +881,292 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ static const bool
+ is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+ // Return a block with a single scalar.
+ if (NumDims <= 0) return scalarBlock(scratch);
+
+ // Because we only support kSkewedInnerDims blocking, block size should be
+ // equal to m_dimensions for inner dims, a smaller than m_dimensions[i] size
+ // for the first outer dim, and 1 for other outer dims. This is guaranteed
+ // by MergeResourceRequirements() in TensorBlock.h.
+ const Dimensions& output_dims = desc.dimensions();
+ const Dimensions output_strides = internal::strides<Layout>(output_dims);
+
+ // Find where outer dims start.
+ int outer_dim_start = 0;
+ Index outer_dim_size = 1;
+ Index inner_dim_size = 1;
+
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = is_col_major ? i : NumDims - i - 1;
+
+ if (i > outer_dim_start) {
+ eigen_assert(output_dims[dim] == 1);
+ } else if (output_dims[dim] != m_dimensions[dim]) {
+ eigen_assert(output_dims[dim] < m_dimensions[dim]);
+ outer_dim_size = output_dims[dim];
+ } else {
+ inner_dim_size *= output_dims[dim];
+ ++outer_dim_start;
+ }
+ }
+
+ if (inner_dim_size == 0 || outer_dim_size == 0) {
+ return emptyBlock();
+ }
+
+ const Dimensions& input_dims = Dimensions(m_impl.dimensions());
+
+ // Pre-fill input_block_sizes, broadcast_block_sizes,
+ // broadcast_block_strides, and broadcast_tensor_strides. Later on we will
+ // only modify the outer_dim_start-th dimension on these arrays.
+
+ // Calculate the input block size for looking into the input.
+ Dimensions input_block_sizes;
+ for (int i = 0; i < outer_dim_start; ++i) {
+ const int dim = is_col_major ? i : NumDims -i - 1;
+ input_block_sizes[dim] = input_dims[dim];
+ }
+ for (int i = outer_dim_start; i < NumDims; ++i) {
+ const int dim = is_col_major ? i : NumDims -i - 1;
+ input_block_sizes[dim] = 1;
+ }
+ Dimensions input_block_strides =
+ internal::strides<Layout>(input_block_sizes);
+
+ // Broadcast with the 0-stride trick: Create 1 extra dim for each
+ // broadcast, set the input stride to 0.
+ //
+ // When ColMajor:
+ //
+ // - bcast_block_sizes:
+ // [d_0, b_0, d_1, b_1, ...]
+ //
+ // - bcast_block_strides:
+ // [output_block_strides[0], output_block_strides[0] * d_0,
+ // output_block_strides[1], output_block_strides[1] * d_1,
+ // ...]
+ //
+ // - bcast_input_strides:
+ // [input_block_strides[0], 0,
+ // input_block_strides[1], 0,
+ // ...].
+ //
+ BroadcastDimensions bcast_block_sizes;
+ BroadcastDimensions bcast_block_strides;
+ BroadcastDimensions bcast_input_strides;
+
+ for (int i = 0; i < outer_dim_start; ++i) {
+ const int dim = is_col_major ? i : NumDims - i - 1;
+
+ const int copy_dim = is_col_major ? 2 * i : 2 * NumDims - 2 * i - 1;
+ const int broadcast_dim = is_col_major ? copy_dim + 1 : copy_dim - 1;
+
+ bcast_block_sizes[copy_dim] = input_dims[dim];
+ bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
+ bcast_block_strides[copy_dim] = output_strides[dim];
+ bcast_block_strides[broadcast_dim] =
+ output_strides[dim] * input_dims[dim];
+ bcast_input_strides[copy_dim] = input_block_strides[dim];
+ bcast_input_strides[broadcast_dim] = 0;
+ }
+ for (int i = 2 * outer_dim_start; i < 2 * NumDims; ++i) {
+ const int dim = is_col_major ? i : 2 * NumDims - i - 1;
+ bcast_block_sizes[dim] = 1;
+ bcast_block_strides[dim] = 0;
+ bcast_input_strides[dim] = 0;
+ }
+
+ const int outer_dim =
+ is_col_major ? outer_dim_start : NumDims - outer_dim_start - 1;
+
+ // Check if we can reuse `desc` destination, or allocate new scratch buffer.
+ ScalarNoConst* materialized_output =
+ desc.template destination<ScalarNoConst, Layout>();
+ bool materialized_in_output;
+
+ if (materialized_output != NULL) {
+ desc.DropDestinationBuffer();
+ materialized_in_output = true;
+
+ } else {
+ materialized_in_output = false;
+ const size_t materialized_output_size = desc.size() * sizeof(Scalar);
+ void* output_scratch_mem = scratch.allocate(materialized_output_size);
+ materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem);
+ }
+
+ size_t materialized_input_size = 0;
+ ScalarNoConst* materialized_input = NULL;
+
+ if (outer_dim_size == 1) {
+ // We just need one block read using the ready-set values above.
+ BroadcastBlockV2(
+ input_block_sizes, input_block_strides, bcast_block_sizes,
+ bcast_block_strides, bcast_input_strides, 0, desc, scratch,
+ materialized_output, &materialized_input, &materialized_input_size);
+
+ } else if (input_dims[outer_dim] == 1) {
+ // Broadcast outer_dim_start-th dimension (< NumDims) by outer_dim_size.
+ const int broadcast_outer_dim =
+ is_col_major ? 2 * outer_dim_start + 1
+ : 2 * NumDims - 2 * outer_dim_start - 2;
+
+ bcast_block_sizes[broadcast_outer_dim] = outer_dim_size;
+ bcast_input_strides[broadcast_outer_dim] = 0;
+ bcast_block_strides[broadcast_outer_dim] = output_strides[outer_dim];
+
+ BroadcastBlockV2(
+ input_block_sizes, input_block_strides, bcast_block_sizes,
+ bcast_block_strides, bcast_input_strides, 0, desc, scratch,
+ materialized_output, &materialized_input, &materialized_input_size);
+
+ } else {
+ // The general case. Let's denote the output block as x[...,
+ // a:a+outer_dim_size, :, ..., :], where a:a+outer_dim_size is a slice on
+ // the outer_dim_start-th dimension (< NumDims). We need to split the
+ // a:a+outer_dim_size into possibly 3 sub-blocks:
+ //
+ // (1) a:b, where b is the smallest multiple of
+ // input_dims[outer_dim_start] in [a, a+outer_dim_size].
+ //
+ // (2) b:c, where c is the largest multiple of input_dims[outer_dim_start]
+ // in [a, a+outer_dim_size].
+ //
+ // (3) c:a+outer_dim_size .
+ //
+ // Or, when b and c do not exist, we just need to process the whole block
+ // together.
+
+ // Find a.
+ const Index outer_dim_left_index =
+ desc.offset() / m_outputStrides[outer_dim];
+
+ // Find b and c.
+ const Index input_outer_dim_size = input_dims[outer_dim];
+
+ // First multiple after a. This is b when <= outer_dim_left_index +
+ // outer_dim_size.
+ const Index first_multiple =
+ divup<Index>(outer_dim_left_index, input_outer_dim_size) *
+ input_outer_dim_size;
+
+ if (first_multiple <= outer_dim_left_index + outer_dim_size) {
+ // b exists, so does c. Find it.
+ const Index last_multiple = (outer_dim_left_index + outer_dim_size) /
+ input_outer_dim_size * input_outer_dim_size;
+ const int copy_outer_dim = is_col_major
+ ? 2 * outer_dim_start
+ : 2 * NumDims - 2 * outer_dim_start - 1;
+ const int broadcast_outer_dim =
+ is_col_major ? 2 * outer_dim_start + 1
+ : 2 * NumDims - 2 * outer_dim_start - 2;
+
+ if (first_multiple > outer_dim_left_index) {
+ const Index head_size = first_multiple - outer_dim_left_index;
+ input_block_sizes[outer_dim] = head_size;
+ bcast_block_sizes[copy_outer_dim] = head_size;
+ bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
+ bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
+ bcast_block_sizes[broadcast_outer_dim] = 1;
+ bcast_input_strides[broadcast_outer_dim] = 0;
+ bcast_block_strides[broadcast_outer_dim] =
+ output_strides[outer_dim] * input_dims[outer_dim];
+
+ BroadcastBlockV2(input_block_sizes, input_block_strides,
+ bcast_block_sizes, bcast_block_strides,
+ bcast_input_strides, 0, desc, scratch,
+ materialized_output, &materialized_input,
+ &materialized_input_size);
+ }
+ if (first_multiple < last_multiple) {
+ input_block_sizes[outer_dim] = input_outer_dim_size;
+ bcast_block_sizes[copy_outer_dim] = input_outer_dim_size;
+ bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
+ bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
+ bcast_block_sizes[broadcast_outer_dim] =
+ (last_multiple - first_multiple) / input_outer_dim_size;
+ bcast_input_strides[broadcast_outer_dim] = 0;
+ bcast_block_strides[broadcast_outer_dim] =
+ output_strides[outer_dim] * input_dims[outer_dim];
+ const Index offset = (first_multiple - outer_dim_left_index) *
+ m_outputStrides[outer_dim];
+
+ BroadcastBlockV2(input_block_sizes, input_block_strides,
+ bcast_block_sizes, bcast_block_strides,
+ bcast_input_strides, offset, desc, scratch,
+ materialized_output, &materialized_input,
+ &materialized_input_size);
+ }
+ if (last_multiple < outer_dim_left_index + outer_dim_size) {
+ const Index tail_size =
+ outer_dim_left_index + outer_dim_size - last_multiple;
+ input_block_sizes[outer_dim] = tail_size;
+ bcast_block_sizes[copy_outer_dim] = tail_size;
+ bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
+ bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
+ bcast_block_sizes[broadcast_outer_dim] = 1;
+ bcast_input_strides[broadcast_outer_dim] = 0;
+ bcast_block_strides[broadcast_outer_dim] =
+ output_strides[outer_dim] * input_dims[outer_dim];
+ const Index offset = (last_multiple - outer_dim_left_index) *
+ m_outputStrides[outer_dim];
+
+ BroadcastBlockV2(input_block_sizes, input_block_strides,
+ bcast_block_sizes, bcast_block_strides,
+ bcast_input_strides, offset, desc, scratch,
+ materialized_output, &materialized_input,
+ &materialized_input_size);
+ }
+ } else {
+ // b and c do not exist.
+ const int copy_outer_dim = is_col_major
+ ? 2 * outer_dim_start
+ : 2 * NumDims - 2 * outer_dim_start - 1;
+ input_block_sizes[outer_dim] = outer_dim_size;
+ bcast_block_sizes[copy_outer_dim] = outer_dim_size;
+ bcast_input_strides[copy_outer_dim] = input_block_strides[outer_dim];
+ bcast_block_strides[copy_outer_dim] = output_strides[outer_dim];
+
+ BroadcastBlockV2(
+ input_block_sizes, input_block_strides, bcast_block_sizes,
+ bcast_block_strides, bcast_input_strides, 0, desc, scratch,
+ materialized_output, &materialized_input, &materialized_input_size);
+ }
+ }
+
+ return TensorBlockV2(materialized_in_output
+ ? internal::TensorBlockKind::kMaterializedInOutput
+ : internal::TensorBlockKind::kMaterializedInScratch,
+ materialized_output,
+ desc.dimensions());
+ }
+
+ // This is a special case for `NumDims == 0`, in practice this should not
+ // happen often, so it's fine to do memory allocation just for a scalar.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ scalarBlock(TensorBlockScratch& scratch) const {
+ void* mem = scratch.allocate(sizeof(Scalar));
+ ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
+ *buf = m_impl.coeff(0);
+
+ DSizes<Index, NumDims> dimensions;
+ for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
+
+ return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch, buf,
+ dimensions);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2 emptyBlock() const {
+ DSizes<Index, NumDims> dimensions;
+ for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
+ return TensorBlockV2(internal::TensorBlockKind::kView, NULL, dimensions);
+ }
+
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
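
The head/middle/tail decomposition described in the comments of blockV2 above is just integer arithmetic on the outer-dimension slice. A numeric sketch with hypothetical values (a = 5, outer_dim_size = 9, input size n = 4 along the outer dimension):

// Numeric sketch of the head/middle/tail split used above when the
// block's outer-dim slice [a, a + outer_dim_size) does not line up
// with multiples of the input's outer dimension size n.
#include <cstdio>

int main() {
  const long a = 5, outer_dim_size = 9, n = 4;  // slice [5, 14), input dim 4

  const long first_multiple = ((a + n - 1) / n) * n;           // divup(a, n) * n = 8
  const long last_multiple  = ((a + outer_dim_size) / n) * n;  // 12

  if (first_multiple <= a + outer_dim_size) {
    std::printf("head   [%ld, %ld)\n", a, first_multiple);                 // [5, 8)
    std::printf("middle [%ld, %ld)\n", first_multiple, last_multiple);     // [8, 12): whole input copies
    std::printf("tail   [%ld, %ld)\n", last_multiple, a + outer_dim_size); // [12, 14)
  } else {
    std::printf("single partial block [%ld, %ld)\n", a, a + outer_dim_size);
  }
}
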
@@ -901,6 +1201,73 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
BroadcastTensorBlockReader::Run(&broadcast_block, input_block.data());
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void BroadcastBlockV2(
+ const Dimensions& input_block_sizes,
+ const Dimensions& input_block_strides,
+ const BroadcastDimensions& bcast_block_sizes,
+ const BroadcastDimensions& bcast_block_strides,
+ const BroadcastDimensions& bcast_input_strides, Index offset,
+ const TensorBlockDesc& output_desc, TensorBlockScratch& scratch,
+ ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
+ size_t* materialized_input_size) const {
+ // ---------------------------------------------------------------------- //
+ // Tensor block descriptor for reading block from the input.
+ const Index input_offset = output_desc.offset() + offset;
+ static const bool is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
+ TensorBlockDesc input_desc(is_col_major
+ ? indexColMajor(input_offset)
+ : indexRowMajor(input_offset),
+ input_block_sizes);
+
+ ArgTensorBlock input_block = m_impl.blockV2(input_desc, scratch);
+
+ // ---------------------------------------------------------------------- //
+ // Materialize input block into a temporary memory buffer only if it's not
+ // already available in the arg block.
+ const ScalarNoConst* input_buffer = NULL;
+
+ if (input_block.data() != NULL) {
+ // Input block already has raw data, there is no need to materialize it.
+ input_buffer = input_block.data();
+
+ } else {
+ // Otherwise we have to do block assignment into a temporary buffer.
+
+ // Maybe reuse previously allocated buffer, or allocate a new one with a
+ // scratch allocator.
+ const size_t input_total_size = input_block_sizes.TotalSize();
+ if (*materialized_input == NULL ||
+ *materialized_input_size < input_total_size) {
+ *materialized_input_size = input_total_size;
+ void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
+ *materialized_input = static_cast<ScalarNoConst*>(mem);
+ }
+
+ typedef internal::TensorBlockAssignment<
+ ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
+ TensorBlockAssignment;
+
+ typename TensorBlockAssignment::Dst assignment_dst(
+ input_block_sizes, input_block_strides, *materialized_input);
+
+ TensorBlockAssignment::Run(assignment_dst, input_block.expr());
+
+ input_buffer = *materialized_input;
+ }
+
+ // ---------------------------------------------------------------------- //
+ // Copy data from materialized input block to the materialized output, using
+ // given broadcast strides (strides with zeroes).
+ typedef internal::TensorBlockIOV2<ScalarNoConst, Index, 2 * NumDims, Layout>
+ TensorBlockIOV2;
+
+ typename TensorBlockIOV2::Src src(bcast_input_strides, input_buffer);
+ typename TensorBlockIOV2::Dst dst(bcast_block_sizes, bcast_block_strides,
+ materialized_output + offset);
+
+ TensorBlockIOV2::Copy(dst, src);
+ }
+
protected:
const Device EIGEN_DEVICE_REF m_device;
const typename internal::remove_reference<Broadcast>::type m_broadcast;
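
To make the 2x-rank, zero-stride broadcast copy used by BroadcastBlockV2 above concrete: with a column-major input of size [2] broadcast by 3, the copy runs over the doubled iteration space [d_0, b_0] = [2, 3], and the zero input stride on the broadcast dimension re-reads the same input elements. A standalone sketch with plain loops (illustrative values, not Eigen's TensorBlockIOV2):

// Standalone sketch of the zero-stride broadcast copy
// (assumed shapes: ColMajor input [2] broadcast by 3 -> output [6]).
#include <cstdio>

int main() {
  const int d0 = 2;  // input ("copy") dimension
  const int b0 = 3;  // broadcast dimension
  const float input[d0] = {10.f, 20.f};

  // bcast_block_sizes   = [d0, b0]
  // bcast_input_strides = [1, 0]   <- zero stride re-reads the input
  // bcast_block_strides = [1, d0]  <- output strides (ColMajor)
  const int input_strides[2]  = {1, 0};
  const int output_strides[2] = {1, d0};

  float output[d0 * b0];
  for (int j = 0; j < b0; ++j) {    // broadcast dim
    for (int i = 0; i < d0; ++i) {  // copy dim
      const int src = i * input_strides[0] + j * input_strides[1];
      const int dst = i * output_strides[0] + j * output_strides[1];
      output[dst] = input[src];
    }
  }
  for (int k = 0; k < d0 * b0; ++k) std::printf("%g ", output[k]);
  std::printf("\n");  // 10 20 10 20 10 20

}
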
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index b630e6867..8860840a7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -149,6 +149,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
Layout = TensorEvaluator<ArgType, Device>::Layout,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ BlockAccessV2 = false,
// Chipping of outer-most dimension is a trivial operation, because we can
// read and write directly from the underlying tensor using single offset.
IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
@@ -169,6 +170,10 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
OutputTensorBlock;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 292a1bae1..c24e74ec6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -125,11 +125,16 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
{
@@ -287,7 +292,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
+
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
@@ -318,11 +323,16 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
: Base(op, device)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 2f8656fbb..d61209133 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -376,12 +376,17 @@ struct TensorContractionEvaluatorBase
IsAligned = true,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
// Most of the code is assuming that both input tensors are ColMajor. If the
// inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
// If we want to compute A * B = C, where A is LHS and B is RHS, the code
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index fa329bfe6..a8160e17e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -303,11 +303,16 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
#endif
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 25e1e5896..8220038c1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -310,12 +310,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
@@ -783,12 +788,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
index e79958fc9..b660242f4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -243,12 +243,17 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
: m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index 723d2b082..f1f46161e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -96,12 +96,17 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<XprType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
: m_op(op), m_device(device), m_result(NULL)
{
@@ -265,12 +270,17 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LhsXprType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_op(op), m_device(device), m_result(NULL)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index dbf5af094..d7bebd30b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -383,8 +383,17 @@ struct DSizes : array<DenseIndex, NumDims> {
}
};
-
-
+template <typename IndexType, int NumDims>
+std::ostream& operator<<(std::ostream& os,
+ const DSizes<IndexType, NumDims>& dims) {
+ os << "[";
+ for (int i = 0; i < NumDims; ++i) {
+ if (i > 0) os << ", ";
+ os << dims[i];
+ }
+ os << "]";
+ return os;
+}
// Boilerplate
namespace internal {
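
The streaming operator added above prints a DSizes object in bracketed, comma-separated form. A minimal usage sketch (assuming an Eigen checkout that includes this patch and the usual unsupported Tensor module header):

// Usage sketch for the DSizes streaming operator introduced above.
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::DSizes<Eigen::Index, 3> dims(2, 3, 4);
  std::cout << dims << std::endl;  // prints: [2, 3, 4]
}
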
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 60a07d6eb..bf7522682 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -111,6 +111,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -124,6 +125,10 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
TensorBlockReader;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index fec735868..c87075a72 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -43,13 +43,14 @@ struct TensorEvaluator
internal::traits<Derived>::NumDimensions : 0;
enum {
- IsAligned = Derived::IsAligned,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
- PreferBlockAccess = false,
- Layout = Derived::Layout,
- CoordAccess = NumCoords > 0,
- RawAccess = true
+ IsAligned = Derived::IsAligned,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
+ BlockAccessV2 = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
+ PreferBlockAccess = false,
+ Layout = Derived::Layout,
+ CoordAccess = NumCoords > 0,
+ RawAccess = true
};
typedef typename internal::TensorBlock<
@@ -62,9 +63,13 @@ struct TensorEvaluator
typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
TensorBlockWriter;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
- : m_data(device.get((const_cast<TensorPointerType>(m.data())))),
- m_dims(m.dimensions()),
+ : m_data(device.get((const_cast<TensorPointerType>(m.data())))),
+ m_dims(m.dimensions()),
m_device(device)
{ }
@@ -162,6 +167,22 @@ struct TensorEvaluator
TensorBlockWriter::Run(block, m_data);
}
+ template<typename TensorBlockV2>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
+ const TensorBlockDesc& desc, const TensorBlockV2& block) {
+ assert(m_data != NULL);
+
+ typedef typename TensorBlockV2::XprType TensorBlockExpr;
+ typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
+ Index>
+ TensorBlockAssign;
+ typename TensorBlockAssign::Dst dst(desc.dimensions(),
+ internal::strides<Layout>(m_dims),
+ m_data, desc.offset());
+
+ TensorBlockAssign::Run(dst, block.expr());
+ }
+
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
@@ -220,28 +241,43 @@ struct TensorEvaluator<const Derived, Device>
typedef StorageMemory<const Scalar, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
internal::traits<Derived>::NumDimensions : 0;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
- IsAligned = Derived::IsAligned,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
+ IsAligned = Derived::IsAligned,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
+ BlockAccessV2 = internal::is_arithmetic<ScalarNoConst>::value,
PreferBlockAccess = false,
- Layout = Derived::Layout,
- CoordAccess = NumCoords > 0,
- RawAccess = true
+ Layout = Derived::Layout,
+ CoordAccess = NumCoords > 0,
+ RawAccess = true
};
- typedef typename internal::TensorBlock<
- typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ typedef typename internal::TensorBlock<ScalarNoConst, Index, NumCoords, Layout>
TensorBlock;
- typedef typename internal::TensorBlockReader<
- typename internal::remove_const<Scalar>::type, Index, NumCoords, Layout>
+ typedef typename internal::TensorBlockReader<ScalarNoConst, Index, NumCoords, Layout>
TensorBlockReader;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumCoords, Layout>
+ TensorBlockIO;
+ typedef typename TensorBlockIO::Dst TensorBlockIODst;
+ typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
+ Layout, Index>
+ TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
: m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
{ }
@@ -310,6 +346,67 @@ struct TensorEvaluator<const Derived, Device>
TensorBlockReader::Run(block, m_data);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ assert(m_data != NULL);
+
+ // TODO(ezhulenev): Move it to TensorBlockV2 and reuse in TensorForcedEval.
+
+ // If a tensor block descriptor covers a contiguous block of the underlying
+ // memory, we can skip block buffer memory allocation, and construct a block
+ // from existing `m_data` memory buffer.
+ //
+ // Example: (RowMajor layout)
+ // m_dims: [11, 12, 13, 14]
+ // desc.dimensions(): [1, 1, 3, 14]
+ //
+ // In this case we can construct a TensorBlock starting at
+ // `m_data + desc.offset()`, with a `desc.dimensions()` block sizes.
+
+ static const bool
+ is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+ // Find out how many inner dimensions have a matching size.
+ int num_matching_inner_dims = 0;
+ for (int i = 0; i < NumCoords; ++i) {
+ int dim = is_col_major ? i : NumCoords - i - 1;
+ if (m_dims[dim] != desc.dimensions()[dim]) break;
+ ++num_matching_inner_dims;
+ }
+
+ // All the outer dimensions must be of size `1`, except a single dimension
+ // before the matching inner dimension (`3` in the example above).
+ bool can_use_direct_access = true;
+ for (int i = num_matching_inner_dims + 1; i < NumCoords; ++i) {
+ int dim = is_col_major ? i : NumCoords - i - 1;
+ if (desc.dimension(dim) != 1) {
+ can_use_direct_access = false;
+ break;
+ }
+ }
+
+ if (can_use_direct_access) {
+ EvaluatorPointerType block_start = m_data + desc.offset();
+ return TensorBlockV2(internal::TensorBlockKind::kView, block_start,
+ desc.dimensions());
+
+ } else {
+ void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+ ScalarNoConst* block_buffer = static_cast<ScalarNoConst*>(mem);
+
+ TensorBlockIOSrc src(internal::strides<Layout>(m_dims), m_data,
+ desc.offset());
+ TensorBlockIODst dst(desc.dimensions(),
+ internal::strides<Layout>(desc.dimensions()),
+ block_buffer);
+
+ TensorBlockIO::Copy(dst, src);
+
+ return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch,
+ block_buffer, desc.dimensions());
+ }
+ }
+
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
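
The direct-access path in blockV2 above boils down to a contiguity predicate: past the run of innermost dimensions whose sizes match the tensor's, at most one dimension may be partial and every dimension outside it must be 1 (the [11, 12, 13, 14] vs [1, 1, 3, 14] example from the comment). An illustrative standalone version of that check for row-major layout (not Eigen's code):

// Illustrative contiguity check (not Eigen's code), row-major layout.
#include <array>
#include <cstdio>

template <size_t N>
bool isContiguousRowMajor(const std::array<int, N>& tensor_dims,
                          const std::array<int, N>& block_dims) {
  // Count innermost dimensions whose size matches the full tensor.
  size_t matching = 0;
  for (size_t i = 0; i < N; ++i) {
    const size_t dim = N - i - 1;  // innermost first for RowMajor
    if (tensor_dims[dim] != block_dims[dim]) break;
    ++matching;
  }
  // One dimension right after the matching run may be partial; all the
  // remaining outer dimensions must be of size 1.
  for (size_t i = matching + 1; i < N; ++i) {
    const size_t dim = N - i - 1;
    if (block_dims[dim] != 1) return false;
  }
  return true;
}

int main() {
  const std::array<int, 4> dims{{11, 12, 13, 14}};
  const std::array<int, 4> a{{1, 1, 3, 14}};   // contiguous view
  const std::array<int, 4> b{{1, 2, 13, 14}};  // contiguous view
  const std::array<int, 4> c{{2, 2, 13, 14}};  // needs a copy
  std::printf("%d %d %d\n", isContiguousRowMajor(dims, a) ? 1 : 0,
              isContiguousRowMajor(dims, b) ? 1 : 0,
              isContiguousRowMajor(dims, c) ? 1 : 0);  // 1 1 0
}
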
@@ -355,12 +452,17 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
#endif
,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
@@ -421,6 +523,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess &
internal::functor_traits<UnaryOp>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -446,6 +549,17 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
TensorBlock;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+ ArgTensorBlock;
+
+ typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock>
+ TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
@@ -505,6 +619,11 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
arg_block.data());
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ return TensorBlockV2(m_argImpl.blockV2(desc, scratch), m_functor);
+ }
+
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
@@ -537,6 +656,8 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
internal::functor_traits<BinaryOp>::PacketAccess,
BlockAccess = TensorEvaluator<LeftArgType, Device>::BlockAccess &
TensorEvaluator<RightArgType, Device>::BlockAccess,
+ BlockAccessV2 = TensorEvaluator<LeftArgType, Device>::BlockAccessV2 &
+ TensorEvaluator<RightArgType, Device>::BlockAccessV2,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
@@ -571,6 +692,20 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
TensorEvaluator<LeftArgType, Device>::Layout>
TensorBlock;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlockV2
+ LeftTensorBlock;
+ typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlockV2
+ RightTensorBlock;
+
+ typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock,
+ RightTensorBlock>
+ TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use right impl instead if right impl dimensions are known at compile time.
@@ -642,6 +777,13 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
right_block.block_strides(), right_block.data());
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ desc.DropDestinationBuffer();
+ return TensorBlockV2(m_leftImpl.blockV2(desc, scratch),
+ m_rightImpl.blockV2(desc, scratch), m_functor);
+ }
+
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
@@ -670,6 +812,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
internal::functor_traits<TernaryOp>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<Arg1Type, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -709,6 +852,10 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use arg2 or arg3 dimensions if they are known at compile time.
@@ -780,6 +927,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
PacketType<Scalar, Device>::HasBlend,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -805,6 +953,10 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use then or else impl instead if they happen to be known at compile time.
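
Note that the unary and binary blockV2 methods above materialize nothing themselves: they wrap the argument blocks together with the functor into an expression-style block, and the arithmetic only runs when that expression is assigned (for example by TensorBlockAssignment in the assignment evaluator). A hedged sketch of the same composition pattern with made-up types:

// Illustrative lazy-block composition (made-up types, not Eigen's):
// the binary block just stores its two argument blocks and a functor;
// coefficient evaluation is deferred until assignment.
#include <cstdio>
#include <functional>
#include <vector>

struct LeafBlock {
  const float* data;
  float coeff(int i) const { return data[i]; }
};

template <typename Op, typename L, typename R>
struct CwiseBinaryBlock {
  L lhs; R rhs; Op op;
  float coeff(int i) const { return op(lhs.coeff(i), rhs.coeff(i)); }
};

template <typename Op, typename L, typename R>
CwiseBinaryBlock<Op, L, R> makeBinaryBlock(L l, R r, Op op) {
  return {l, r, op};
}

int main() {
  std::vector<float> a{1, 2, 3}, b{10, 20, 30}, out(3);
  auto block = makeBinaryBlock(LeafBlock{a.data()}, LeafBlock{b.data()},
                               std::plus<float>());
  // "Assignment" step: only now are coefficients computed.
  for (int i = 0; i < 3; ++i) out[i] = block.coeff(i);
  std::printf("%g %g %g\n", out[0], out[1], out[2]);  // 11 22 33
}
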
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index cf07656b3..a7cb8dc97 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -23,7 +23,7 @@ namespace Eigen {
*
* @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
* instructions)
- * @tparam Tileable can use block based tensor evaluation
+ * @tparam Tiling can use block based tensor evaluation
* (see TensorBlock.h)
*/
namespace internal {
@@ -76,8 +76,13 @@ struct ExpressionHasTensorBroadcastingOp<
* Default strategy: the expression is evaluated sequentially with a single cpu
* thread, without vectorization and block evaluation.
*/
+#if EIGEN_HAS_CXX11
template <typename Expression, typename Device, bool Vectorizable,
- bool Tileable>
+ TiledEvaluation Tiling>
+#else
+ template <typename Expression, typename Device, bool Vectorizable,
+ TiledEvaluation::TiledEvaluation Tiling>
+#endif
class TensorExecutor {
public:
typedef typename Expression::Index StorageIndex;
@@ -109,8 +114,8 @@ class TensorAsyncExecutor {};
* Process all the data with a single cpu thread, using vectorized instructions.
*/
template <typename Expression>
-class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
- /*Tileable*/ false> {
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
+ /*Tiling=*/TiledEvaluation::Off> {
public:
typedef typename Expression::Index StorageIndex;
@@ -152,7 +157,7 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
*/
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
- /*Tileable*/ true> {
+ /*Tiling=*/TiledEvaluation::Legacy> {
public:
typedef typename traits<Expression>::Scalar Scalar;
typedef typename remove_const<Scalar>::type ScalarNoConst;
@@ -176,8 +181,7 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
if (total_size < cache_size
&& !ExpressionHasTensorBroadcastingOp<Expression>::value) {
// TODO(andydavis) Reduce block management overhead for small tensors.
- internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
- /*Tileable*/ false>::run(expr, device);
+ internal::TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tiling=*/TiledEvaluation::Off>::run(expr,device);
evaluator.cleanup();
return;
}
@@ -212,6 +216,70 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
};
/**
+ * Process all the data with a single cpu thread, using blocks of data. By
+ * sizing a block to fit L1 cache we get better cache performance.
+ */
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+ /*Tiling=*/TiledEvaluation::On> {
+ public:
+ typedef typename traits<Expression>::Scalar Scalar;
+ typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+ typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
+ typedef typename traits<Expression>::Index StorageIndex;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(const Expression& expr,
+ const DefaultDevice& device = DefaultDevice()) {
+ typedef TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlock;
+ typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlockMapper;
+ typedef typename TensorBlock::Dimensions TensorBlockDimensions;
+
+ typedef internal::TensorBlockDescriptor<NumDims> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<DefaultDevice>
+ TensorBlockScratch;
+
+ Evaluator evaluator(expr, device);
+ Index total_size = array_prod(evaluator.dimensions());
+ Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+
+ // TODO(ezhulenev): Do not use tiling for small tensors?
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+
+ if (needs_assign) {
+ // Size tensor blocks to fit in cache (or requested target block size).
+ Index block_total_size = numext::mini(cache_size, total_size);
+ TensorBlockShapeType block_shape = kSkewedInnerDims;
+ // Query expression tree for desired block size/shape.
+ std::vector<TensorOpResourceRequirements> resources;
+ evaluator.getResourceRequirements(&resources);
+ MergeResourceRequirements(resources, &block_shape, &block_total_size);
+
+ TensorBlockMapper block_mapper(
+ TensorBlockDimensions(evaluator.dimensions()), block_shape,
+ block_total_size);
+ block_total_size = block_mapper.block_dims_total_size();
+
+ // Share scratch memory allocator between all blocks.
+ TensorBlockScratch scratch(device);
+
+ const StorageIndex total_block_count = block_mapper.total_block_count();
+ for (StorageIndex i = 0; i < total_block_count; ++i) {
+ TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);
+
+ TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
+ evaluator.evalBlockV2(desc, scratch);
+ scratch.reset();
+ }
+ }
+ evaluator.cleanup();
+ }
+};
+
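A minimal user-level sketch (not part of this patch) of what ends up dispatching to the executor above: assigning a block-friendly expression on the DefaultDevice. Once the broadcasting evaluator advertises BlockAccessV2, IsTileable resolves to TiledEvaluation::On and operator= routes through this specialization; the shapes below are illustrative only.

#include <unsupported/Eigen/CXX11/Tensor>

void broadcast_on_default_device() {
  Eigen::Tensor<float, 2> in(4, 1);
  in.setRandom();
  // Broadcasting is a block-preferring op; the assignment builds a
  // TensorAssignOp and runs TensorExecutor<..., DefaultDevice> on it.
  Eigen::array<Eigen::Index, 2> bcast = {{1, 8}};
  Eigen::Tensor<float, 2> out = in.broadcast(bcast);
  (void)out;
}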
+/**
* Multicore strategy: the index space is partitioned and each partition is
* executed on a single core.
*
@@ -256,10 +324,11 @@ struct TensorExecutorTilingContext {
};
// Computes block evaluation parameters and allocates a temporary memory buffer
-// for blocks. See TensorExecutor/TensorAsyncExecutor (Tileable=true) below.
+// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
- const ThreadPoolDevice& device, const Evaluator& evaluator) {
+ const ThreadPoolDevice& device, const Evaluator& evaluator,
+ bool allocate_buffer = true) {
// Prefer blocks skewed toward inner dimension.
TensorBlockShapeType block_shape = kSkewedInnerDims;
Index block_total_size = 0;
@@ -284,7 +353,13 @@ TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
const size_t aligned_blocksize =
align *
divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
- void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
+
+ // TODO(ezhulenev): In new block evaluation framework there is no need for
+ // allocating temporary buffers, remove this after migration.
+ void* buf = NULL;
+ if (allocate_buffer) {
+ buf = device.allocate((num_threads + 1) * aligned_blocksize);
+ }
return {block_mapper, cost * block_size, buf, aligned_blocksize};
}
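For reference, the sizing above rounds each block's byte footprint up to the alignment and reserves one scratch slot per thread plus one for the caller. A standalone arithmetic sketch (local divup helper and illustrative numbers, not Eigen's code):

#include <cstddef>
#include <cstdio>

static std::size_t divup(std::size_t x, std::size_t y) { return (x + y - 1) / y; }

int main() {
  const std::size_t align = 128;         // e.g. a cache-line-sized alignment
  const std::size_t block_size = 1000;   // coefficients per block
  const std::size_t scalar_bytes = 4;    // sizeof(float)
  // 4000 bytes rounded up to a multiple of 128 -> 4096 bytes per block.
  const std::size_t aligned_blocksize =
      align * divup(block_size * scalar_bytes, align);
  const std::size_t num_threads = 8;
  // One scratch region per thread plus one spare: 9 * 4096 = 36864 bytes.
  std::printf("buffer bytes: %zu\n", (num_threads + 1) * aligned_blocksize);
  return 0;
}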
@@ -344,8 +419,8 @@ struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
}
};
-template <typename Expression, bool Vectorizable, bool Tileable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
@@ -369,7 +444,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
};
template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+ /*Tiling=*/TiledEvaluation::Legacy> {
public:
typedef typename traits<Expression>::Index StorageIndex;
typedef typename traits<Expression>::Scalar Scalar;
@@ -387,11 +463,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
Index total_size = array_prod(evaluator.dimensions());
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
- if (total_size < cache_size
- && !ExpressionHasTensorBroadcastingOp<Expression>::value) {
+ if (total_size < cache_size &&
+ !ExpressionHasTensorBroadcastingOp<Expression>::value) {
// TODO(andydavis) Reduce block management overhead for small tensors.
internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
- /*Tileable*/ false>::run(expr, device);
+ /*Tiling=*/TiledEvaluation::Off>::run(expr,
+ device);
evaluator.cleanup();
return;
}
@@ -419,6 +496,57 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
}
};
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+ /*Tiling=*/TiledEvaluation::On> {
+ public:
+ typedef typename traits<Expression>::Index IndexType;
+ typedef typename traits<Expression>::Scalar Scalar;
+ typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
+
+ typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+ typedef TensorBlockMapper<ScalarNoConst, IndexType, NumDims,
+ Evaluator::Layout>
+ BlockMapper;
+ typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+ typedef internal::TensorBlockDescriptor<NumDims, IndexType>
+ TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
+ TensorBlockScratch;
+
+ static EIGEN_STRONG_INLINE void run(const Expression& expr,
+ const ThreadPoolDevice& device) {
+ Evaluator evaluator(expr, device);
+
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ if (needs_assign) {
+ const TilingContext tiling =
+ internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
+ Vectorizable>(
+ device, evaluator, /*allocate_buffer=*/false);
+
+ auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
+ IndexType lastBlockIdx) {
+ TensorBlockScratch scratch(device);
+
+ for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
+ auto block = tiling.block_mapper.GetBlockForIndex(block_idx, nullptr);
+ TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
+ evaluator.evalBlockV2(desc, scratch);
+ scratch.reset();
+ }
+ };
+
+ device.parallelFor(tiling.block_mapper.total_block_count(), tiling.cost,
+ eval_block);
+ }
+ evaluator.cleanup();
+ }
+};
+
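The parallelFor pattern used by this executor can be exercised in isolation. A hedged sketch (thread count, block count, and cost numbers are made up): the device splits the block index space into sub-ranges and calls the functor once per range on a pool thread, which is where each invocation would create its own TensorBlockScratch.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <cstdio>

int main() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, 4);

  const Eigen::Index total_block_count = 64;
  const Eigen::TensorOpCost cost_per_block(/*bytes_loaded=*/4096,
                                           /*bytes_stored=*/4096,
                                           /*compute_cycles=*/1024);

  device.parallelFor(total_block_count, cost_per_block,
                     [](Eigen::Index first, Eigen::Index last) {
                       // One call per sub-range; a real executor would
                       // evaluate blocks [first, last) here.
                       std::printf("blocks [%lld, %lld)\n",
                                   static_cast<long long>(first),
                                   static_cast<long long>(last));
                     });
  return 0;
}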
template <typename Expression, typename DoneCallback, bool Vectorizable,
bool Tileable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
@@ -562,8 +690,8 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
-template <typename Expression, bool Vectorizable, bool Tileable>
-class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
static void run(const Expression& expr, const GpuDevice& device);
@@ -612,8 +740,8 @@ EigenMetaKernel(Evaluator eval, StorageIndex size) {
}
/*static*/
-template <typename Expression, bool Vectorizable, bool Tileable>
-EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
@@ -711,8 +839,8 @@ struct ExecExprFunctorKernel<Expr, false, Evaluator>
range_, vectorizable_threads_, evaluator) {}
};
-template <typename Expression, bool Vectorizable, bool Tileable>
-class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tileable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index Index;
static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index 8d1a6d9cc..7be007d94 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -138,12 +138,17 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
IsAligned = false,
PacketAccess = true,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index 71ba56773..5f06c97ab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -42,12 +42,17 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
RawAccess = true
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
typedef Dimensions_ Dimensions;
static const std::size_t NumIndices = Dimensions::count;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index efd6a7557..8d45bd62a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -97,6 +97,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
IsAligned = true,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true
@@ -109,8 +110,12 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
TensorBlockReader;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_op(op.expression()),
+ : m_impl(op.expression(), device), m_op(op.expression()),
m_device(device), m_buffer(NULL)
{ }
@@ -132,13 +137,13 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
#endif
typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
EvalTo evalToTmp(m_device.get(m_buffer), m_op);
- const bool Vectorize = internal::IsVectorizable<Device, const ArgType>::value;
- const bool Tile = TensorEvaluator<const ArgType, Device>::BlockAccess &&
- TensorEvaluator<const ArgType, Device>::PreferBlockAccess;
- internal::TensorExecutor<const EvalTo,
- typename internal::remove_const<Device>::type,
- Vectorize, Tile>::run(evalToTmp, m_device);
+ internal::TensorExecutor<
+ const EvalTo, typename internal::remove_const<Device>::type,
+ /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
+ /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
+ run(evalToTmp, m_device);
+
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 772dbbe35..f33489a33 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -154,23 +154,61 @@ struct IsVectorizable<GpuDevice, Expression> {
TensorEvaluator<Expression, GpuDevice>::IsAligned;
};
+// Tiled evaluation strategy.
+#if !EIGEN_HAS_CXX11
+// To be able to use `TiledEvaluation::Off` in C++03 we need a namespace.
+// (Use of enumeration in a nested name specifier is a C++11 extension).
+namespace TiledEvaluation {
+#endif
+enum TiledEvaluation {
+ Off = 0, // tiled evaluation is not supported
+ On = 1, // still work in progress (see TensorBlockV2.h)
+ Legacy = 2 // soon to be deprecated (see TensorBlock.h)
+};
+#if !EIGEN_HAS_CXX11
+} // namespace TiledEvaluation
+#endif
+
template <typename Device, typename Expression>
struct IsTileable {
+#if !EIGEN_HAS_CXX11
+ typedef TiledEvaluation::TiledEvaluation TiledEvaluation;
+#endif
+
// Check that block evaluation is supported and it's a preferred option (at
// least one sub-expression has much faster block evaluation, e.g.
// broadcasting).
- static const bool value = TensorEvaluator<Expression, Device>::BlockAccess &&
- TensorEvaluator<Expression, Device>::PreferBlockAccess;
+ static const bool BlockAccess =
+ TensorEvaluator<Expression, Device>::BlockAccess &&
+ TensorEvaluator<Expression, Device>::PreferBlockAccess;
+
+ static const bool BlockAccessV2 =
+ TensorEvaluator<Expression, Device>::BlockAccessV2 &&
+ TensorEvaluator<Expression, Device>::PreferBlockAccess;
+
+
+ static const TiledEvaluation value =
+ BlockAccessV2
+ ? TiledEvaluation::On
+ : (BlockAccess ? TiledEvaluation::Legacy : TiledEvaluation::Off);
};
+#if EIGEN_HAS_CXX11
+template <typename Expression, typename Device,
+ bool Vectorizable = IsVectorizable<Device, Expression>::value,
+ TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
+class TensorExecutor;
+#else
template <typename Expression, typename Device,
bool Vectorizable = IsVectorizable<Device, Expression>::value,
- bool Tileable = IsTileable<Device, Expression>::value>
+ TiledEvaluation::TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
class TensorExecutor;
+#endif
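To make the three-way selection explicit, here is a standalone analogue of the trait logic above (illustrative only, with renamed identifiers, not Eigen's code): the V2 flag wins, the legacy flag comes next, and everything else falls back to non-tiled execution, which then becomes TensorExecutor's default Tiling argument.

enum TiledEvaluationKind { kOff = 0, kOn = 1, kLegacy = 2 };

template <bool BlockAccess, bool BlockAccessV2, bool PreferBlockAccess>
struct TilingOf {
  // Mirrors IsTileable: both legacy and V2 block access only count when the
  // expression also prefers block evaluation.
  static const TiledEvaluationKind value =
      (BlockAccessV2 && PreferBlockAccess)
          ? kOn
          : ((BlockAccess && PreferBlockAccess) ? kLegacy : kOff);
};

static_assert(TilingOf<true, true, true>::value == kOn, "V2 wins");
static_assert(TilingOf<true, false, true>::value == kLegacy, "legacy fallback");
static_assert(TilingOf<false, false, true>::value == kOff, "no block access");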
+// TODO(ezhulenev): Add TiledEvaluation support to async executor.
template <typename Expression, typename Device, typename DoneCallback,
bool Vectorizable = IsVectorizable<Device, Expression>::value,
- bool Tileable = IsTileable<Device, Expression>::value>
+ bool Tileable = IsTileable<Device, Expression>::BlockAccess>
class TensorAsyncExecutor;
} // end namespace internal
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index b7ad33626..639e1dbb0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -94,6 +94,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = true,
+ BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -105,6 +106,10 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
typedef internal::TensorBlock<CoeffReturnType, Index, NumDims, Layout>
TensorBlock;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_device(device), m_generator(op.generator())
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 5ff67bdae..38bf80c5d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -232,6 +232,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
+ BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
@@ -241,6 +242,10 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
typedef internal::TensorBlock<Scalar, Index, NumDims, Layout>
OutputTensorBlock;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
: m_device(device), m_impl(op.expression(), device)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index f8cda6574..e1df84a1d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -93,12 +93,17 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_strides(op.strides())
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index 755170a34..f84edc6b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -120,12 +120,17 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
@@ -195,11 +200,16 @@ template<typename ArgType, typename Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false // to be implemented
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 9ab1415ac..c8333e488 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -121,6 +121,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
TensorEvaluator<ArgType, Device>::RawAccess &&
NumInputDims > 0 && NumOutputDims > 0,
+ BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -137,6 +138,10 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
Layout>
OutputTensorBlockReader;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dimensions(op.dimensions())
{
@@ -363,6 +368,7 @@ template<typename NewDimensions, typename ArgType, typename Device>
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -378,6 +384,10 @@ template<typename NewDimensions, typename ArgType, typename Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(index);
@@ -532,6 +542,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
@@ -543,6 +554,10 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> TensorBlock;
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
{
@@ -813,6 +828,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
@@ -824,6 +840,10 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout> TensorBlock;
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
@@ -1002,11 +1022,16 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device),
m_device(device),
@@ -1179,12 +1204,17 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index e98382cc1..7b9ad7374 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -99,12 +99,17 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 47db839db..8158aa574 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -97,12 +97,17 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 5dddfcf85..cee7ae657 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -585,6 +585,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
IsAligned = false,
PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -598,6 +599,10 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
typedef internal::TensorBlock<ScalarNoConst, Index, NumInputDims, Layout>
InputTensorBlock;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool RunningFullReduction = (NumOutputDims==0);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index b92c9ffaf..87072006d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -142,12 +142,17 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = PlainObjectType::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===------------------------------------------------------------------===//
+
EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
}
@@ -374,12 +379,17 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorRef<Derived>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
: m_ref(m)
{ }
@@ -401,7 +411,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
}
EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); }
-
+
protected:
TensorRef<Derived> m_ref;
};
@@ -423,10 +433,15 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
{ }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 123675196..855d04eb7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -116,6 +116,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
+ BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -128,6 +129,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
OutputTensorBlock;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: m_impl(op.expression(), device),
@@ -400,6 +405,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -413,6 +419,10 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return this->m_dimensions; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 44156126d..1e6fc93b1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -100,12 +100,17 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = true
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: m_impl(op.expression(), device),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index ad6332179..5e8abad75 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -116,6 +116,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -129,6 +130,10 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
typedef internal::TensorBlockReader<ScalarNoConst, Index, NumDims, Layout>
TensorBlockReader;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: m_device(device),
@@ -426,6 +431,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ BlockAccessV2 = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
@@ -438,6 +444,10 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
typedef internal::TensorBlockWriter<ScalarNoConst, Index, NumDims, Layout>
TensorBlockWriter;
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 3b1cbaabc..8c05704c2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -115,12 +115,17 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
index d04b1bea7..9dc7723cb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -98,12 +98,17 @@ struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_traceDim(1), m_device(device)
{
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 29a2d5538..292393e9a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -184,12 +184,17 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
+ BlockAccessV2 = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ //===--------------------------------------------------------------------===//
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
m_impl(op.expression(), device)
{