author     Eugene Zhulenev <ezhulenev@google.com>  2019-10-02 12:44:06 -0700
committer  Eugene Zhulenev <ezhulenev@google.com>  2019-10-02 12:44:06 -0700
commit     60ae24ee1a6c16114de456d77fcfba6f5a1160ca (patch)
tree       7b9d5463018055571a5050ca31a8d3df12a3e6fc /unsupported
parent     6e40454a6e6cc57c07c7340148657c985ca6c928 (diff)
Add block evaluation to TensorReshaping/TensorCasting/TensorPadding/TensorSelect
Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h       | 275
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h  | 8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h    | 55
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h     | 138
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h     | 11
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h      | 95
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h       | 254
-rw-r--r--  unsupported/test/cxx11_tensor_block_eval.cpp             | 180
-rw-r--r--  unsupported/test/cxx11_tensor_executor.cpp               | 15
9 files changed, 863 insertions, 168 deletions
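
For context, a minimal usage sketch of the four expression types that gain tiled (block) evaluation with this commit. Whether the block path is actually taken is an internal executor decision; the shapes, values, and the helper function name below are illustrative only, not part of the patch.

#include <unsupported/Eigen/CXX11/Tensor>
#include <utility>

void block_eval_examples() {
  using Eigen::Index;
  using Eigen::Tensor;

  Tensor<float, 2> a(32, 64), b(32, 64);
  Tensor<bool, 2> cond(32, 64);
  a.setRandom();
  b.setRandom();
  cond.setRandom();

  // TensorReshaping: reshape of an expression with raw access.
  Eigen::array<Index, 2> new_shape = {{64, 32}};
  Tensor<float, 2> reshaped = a.reshape(new_shape);

  // TensorCasting: the cast is applied lazily, block by block.
  Tensor<double, 2> casted = a.cast<double>();

  // TensorPadding: pad with a constant value; blocks are materialized.
  Eigen::array<std::pair<Index, Index>, 2> paddings;
  paddings[0] = std::make_pair(1, 2);  // dim 0: 1 before, 2 after
  paddings[1] = std::make_pair(0, 3);  // dim 1: 0 before, 3 after
  Tensor<float, 2> padded = a.pad(paddings, /*padding_value=*/0.0f);

  // TensorSelect: composed lazily from condition/then/else blocks.
  Tensor<float, 2> selected = cond.select(a, b);
}
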
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
index 25047b8e5..4d2145bf3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
@@ -12,13 +12,18 @@ namespace Eigen {
namespace internal {
// -------------------------------------------------------------------------- //
+// Forward declarations for templates defined below.
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIOV2;
+
+// -------------------------------------------------------------------------- //
// Helper function to compute strides for densely stored buffer of given
// dimensions.
// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
// this function instead everywhere.
template <int Layout, typename IndexType, int NumDims>
-EIGEN_STRONG_INLINE DSizes<IndexType, NumDims> strides(
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
const DSizes<IndexType, NumDims>& dimensions) {
DSizes<IndexType, NumDims> strides;
if (NumDims == 0) return strides;
@@ -40,6 +45,14 @@ EIGEN_STRONG_INLINE DSizes<IndexType, NumDims> strides(
return strides;
}
+#if EIGEN_HAS_CXX11
+template <int Layout, std::ptrdiff_t... Indices>
+EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
+ const Sizes<Indices...>& sizes) {
+ return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
+}
+#endif
+
// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.
@@ -155,6 +168,14 @@ class TensorBlockDescriptor {
DestinationBuffer(dst_base, m_dimensions, dst_strides, total_dst_bytes);
}
+ template <typename Scalar, typename DstStridesIndexType>
+ void AddDestinationBuffer(
+ Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides,
+ size_t total_dst_bytes) {
+ // DSizes constructor will do index type promotion if it's safe.
+ AddDestinationBuffer(dst_base, Dimensions(dst_strides), total_dst_bytes);
+ }
+
TensorBlockDescriptor& DropDestinationBuffer() {
m_destination.m_data = NULL;
return *this;
@@ -333,10 +354,11 @@ class TensorMaterializedBlock {
typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
#endif
public:
+ typedef DSizes<IndexType, NumDims> Dimensions;
typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
- const DSizes<IndexType, NumDims>& dimensions)
+ const Dimensions& dimensions)
: m_kind(kind),
m_data(data),
m_dimensions(dimensions),
@@ -352,18 +374,84 @@ class TensorMaterializedBlock {
// properly for TensorMap.
const XprType& expr() const { return m_expr; }
const Scalar* data() const { return m_data; }
-
void cleanup() {}
+ typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+
+ // Creates a materialized block for the given descriptor from a memory buffer.
+ template <typename DataDimensions, typename TensorBlockScratch>
+ EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
+ const Scalar* data, const DataDimensions& data_dims,
+ TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+ eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
+
+ // If the tensor block dimensions cover a contiguous block of the underlying
+ // memory, we can skip the block buffer memory allocation and construct a
+ // block from the existing `data` memory buffer.
+ //
+ // Example: (RowMajor layout)
+ // data_dims: [11, 12, 13, 14]
+ // desc.dimensions(): [1, 1, 3, 14]
+ //
+ // In this case we can construct a TensorBlock starting at
+ // `data + desc.offset()`, with `desc.dimensions()` block sizes.
+ static const bool is_col_major = Layout == ColMajor;
+
+ // Find out how many inner dimensions have a matching size.
+ int num_matching_inner_dims = 0;
+ for (int i = 0; i < NumDims; ++i) {
+ int dim = is_col_major ? i : NumDims - i - 1;
+ if (data_dims[dim] != desc.dimensions()[dim]) break;
+ ++num_matching_inner_dims;
+ }
+
+ // All the outer dimensions must be of size `1`, except a single dimension
+ // before the matching inner dimension (`3` in the example above).
+ bool can_use_direct_access = true;
+ for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
+ int dim = is_col_major ? i : NumDims - i - 1;
+ if (desc.dimension(dim) != 1) {
+ can_use_direct_access = false;
+ break;
+ }
+ }
+
+ if (can_use_direct_access) {
+ const Scalar* block_start = data + desc.offset();
+ return TensorMaterializedBlock(TensorBlockKind::kView, block_start,
+ desc.dimensions());
+
+ } else {
+ void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+ Scalar* block_buffer = static_cast<Scalar*>(mem);
+
+ typedef internal::TensorBlockIOV2<Scalar, IndexType, NumDims, Layout>
+ TensorBlockIO;
+ typedef typename TensorBlockIO::Dst TensorBlockIODst;
+ typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+ TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
+ data, desc.offset());
+ TensorBlockIODst dst(desc.dimensions(),
+ internal::strides<Layout>(desc.dimensions()),
+ block_buffer);
+
+ TensorBlockIO::Copy(dst, src);
+
+ return TensorMaterializedBlock(TensorBlockKind::kMaterializedInScratch,
+ block_buffer, desc.dimensions());
+ }
+ }
+
private:
TensorBlockKind m_kind;
const Scalar* m_data;
- DSizes<IndexType, NumDims> m_dimensions;
+ Dimensions m_dimensions;
XprType m_expr;
};
// -------------------------------------------------------------------------- //
-// TensorCwiseUnaryBlock is a lazy tensor expression that applies UnaryOp
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
// functor to the blocks produced by the underlying Tensor expression.
template <typename UnaryOp, typename ArgTensorBlock>
@@ -398,7 +486,7 @@ class TensorCwiseUnaryBlock {
};
// -------------------------------------------------------------------------- //
-// TensorCwiseUnaryBlock is a lazy tensor expression that applies BinaryOp
+// TensorCwiseBinaryBlock is a lazy tensor expression block that applies BinaryOp
// functor to the blocks produced by the underlying Tensor expression.
template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
@@ -447,6 +535,96 @@ class TensorCwiseBinaryBlock {
};
// -------------------------------------------------------------------------- //
+// TensorUnaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from a block of the underlying type (this is a
+// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
+
+template <typename BlockFactory, typename ArgTensorBlock>
+class TensorUnaryExprBlock {
+#if !EIGEN_HAS_CXX11
+ typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
+#endif
+
+ typedef typename ArgTensorBlock::XprType ArgXprType;
+ static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
+
+ public:
+ typedef typename conditional<
+ NoArgBlockAccess, void,
+ typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
+
+ typedef typename XprScalar<XprType>::type Scalar;
+
+ TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
+ const BlockFactory& factory)
+ : m_arg_block(arg_block), m_factory(factory) {}
+
+ TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+ XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
+ const Scalar* data() const { return NULL; }
+ void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+ ArgTensorBlock m_arg_block;
+ BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorTernaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from three blocks of the underlying type.
+
+template <typename BlockFactory, typename Arg1TensorBlock,
+ typename Arg2TensorBlock, typename Arg3TensorBlock>
+class TensorTernaryExprBlock {
+#if !EIGEN_HAS_CXX11
+ typedef internal::TensorBlockKind::TensorBlockKind TensorBlockKind;
+#endif
+
+ typedef typename Arg1TensorBlock::XprType Arg1XprType;
+ typedef typename Arg2TensorBlock::XprType Arg2XprType;
+ typedef typename Arg3TensorBlock::XprType Arg3XprType;
+
+ static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
+ internal::is_void<Arg2XprType>::value ||
+ internal::is_void<Arg3XprType>::value;
+
+ public:
+ typedef typename conditional<
+ NoArgBlockAccess, void,
+ typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
+ Arg3XprType>::type>::type XprType;
+
+ typedef typename XprScalar<XprType>::type Scalar;
+
+ TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
+ const Arg2TensorBlock& arg2_block,
+ const Arg3TensorBlock& arg3_block,
+ const BlockFactory& factory)
+ : m_arg1_block(arg1_block),
+ m_arg2_block(arg2_block),
+ m_arg3_block(arg3_block),
+ m_factory(factory) {}
+
+ TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+ XprType expr() const {
+ return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
+ m_arg3_block.expr());
+ }
+ const Scalar* data() const { return NULL; }
+ void cleanup() {
+ m_arg1_block.cleanup();
+ m_arg2_block.cleanup();
+ m_arg3_block.cleanup();
+ }
+
+ private:
+ Arg1TensorBlock m_arg1_block;
+ Arg2TensorBlock m_arg2_block;
+ Arg3TensorBlock m_arg3_block;
+ BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for scatter/gather.
@@ -547,7 +725,13 @@ class StridedLinearBufferCopy {
} else if (kind == FillLinear) {
// Fill `dst` with value at `*src`.
eigen_assert(src_stride == 0 && dst_stride == 1);
+ const IndexType unrolled_size = count - 4 * PacketSize;
Packet p = pload1<Packet>(src);
+ for (; i <= unrolled_size; i += 4 * PacketSize) {
+ for (int j = 0; j < 4; ++j) {
+ pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+ }
+ }
for (; i <= vectorized_size; i += PacketSize) {
pstoreu<Scalar, Packet>(dst + i, p);
}
@@ -809,15 +993,15 @@ class TensorBlockIOV2 {
// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
-// a Tensor block defined by `desc`, backed by a memory buffer at `dst` address.
+// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
//
// Currently there is no way to write from a Tensor expression to a block of
// memory, if dimensions are reordered. If you need to do that, you should
// materialize a Tensor block expression into a memory buffer, and then use
// TensorBlockIO to copy data between two memory buffers with a custom
-// `dst->src` dimension map (see definition above).
+// `target->src` dimension map (see definition above).
//
-// Also currently the innermost dimension of `dst` must have a stride '1'
+// Also currently the innermost dimension of `target` must have a stride '1'
// (contiguous in memory). This restriction could be lifted with a `pscatter`,
// but in practice it's never needed, and there is a similar TensorBlockIO
// workaround for that.
@@ -842,18 +1026,18 @@ class TensorBlockAssignment {
template <bool Vectorizable, typename Evaluator>
struct InnerDimAssign {
- EIGEN_ALWAYS_INLINE static void Run(Scalar* dst, IndexType count,
+ EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
const Evaluator& eval,
IndexType eval_offset) {
for (IndexType i = 0; i < count; ++i) {
- dst[i] = eval.coeff(eval_offset + i);
+ target[i] = eval.coeff(eval_offset + i);
}
}
};
template <typename Evaluator>
struct InnerDimAssign<true, Evaluator> {
- EIGEN_ALWAYS_INLINE static void Run(Scalar* dst, IndexType count,
+ EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
const Evaluator& eval,
IndexType eval_offset) {
typedef typename packet_traits<Scalar>::type Packet;
@@ -866,26 +1050,29 @@ class TensorBlockAssignment {
for (int j = 0; j < 4; ++j) {
const IndexType idx = eval_offset + i + j * PacketSize;
Packet p = eval.template packet<Unaligned>(idx);
- pstoreu<Scalar>(dst + i + j * PacketSize, p);
+ pstoreu<Scalar>(target + i + j * PacketSize, p);
}
}
for (; i <= vectorized_size; i += PacketSize) {
Packet p = eval.template packet<Unaligned>(eval_offset + i);
- pstoreu<Scalar>(dst + i, p);
+ pstoreu<Scalar>(target + i, p);
}
for (; i < count; ++i) {
- dst[i] = eval.coeff(eval_offset + i);
+ target[i] = eval.coeff(eval_offset + i);
}
}
};
public:
- struct Dst {
- Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
- IndexType dst_offset = 0)
- : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
+ struct Target {
+ Target(const Dimensions& target_dims, const Dimensions& target_strides,
+ Scalar* target_data, IndexType target_offset = 0)
+ : dims(target_dims),
+ strides(target_strides),
+ data(target_data),
+ offset(target_offset) {}
Dimensions dims;
Dimensions strides;
@@ -893,34 +1080,50 @@ class TensorBlockAssignment {
IndexType offset;
};
+ static Target target(const Dimensions& target_dims,
+ const Dimensions& target_strides, Scalar* target_data,
+ IndexType target_offset = 0) {
+ return Target(target_dims, target_strides, target_data, target_offset);
+ }
+
+ template <typename TargetDimsIndexType, typename TargetStridesIndexType>
+ static Target target(
+ const DSizes<TargetDimsIndexType, NumDims>& target_dims,
+ const DSizes<TargetStridesIndexType, NumDims>& target_strides,
+ Scalar* target_data, IndexType target_offset = 0) {
+ // DSizes constructor will do index type promotion if it's safe.
+ return Target(Dimensions(target_dims), Dimensions(target_strides),
+ target_data, target_offset);
+ }
+
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const Dst& dst, const TensorBlockExpr& expr) {
+ const Target& target, const TensorBlockExpr& expr) {
// Prepare evaluator for block expression.
DefaultDevice default_device;
TensorBlockEvaluator eval(expr, default_device);
// Tensor block expression dimension should match destination dimensions.
- eigen_assert(dimensions_match(dst.dims, eval.dimensions()));
+ eigen_assert(dimensions_match(target.dims, eval.dimensions()));
static const int Layout = TensorBlockEvaluator::Layout;
static const bool is_col_major = Layout == ColMajor;
// Initialize output inner dimension size based on a layout.
- const IndexType output_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
+ const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
- IndexType output_inner_dim_size = dst.dims[inner_dim_idx];
+ IndexType output_inner_dim_size = target.dims[inner_dim_idx];
- // Dst inner dimension stride must be '1'.
- eigen_assert(dst.strides[inner_dim_idx] == 1);
+ // Target inner dimension stride must be '1'.
+ eigen_assert(target.strides[inner_dim_idx] == 1);
- // Squeeze multiple inner dims into one if they are contiguous in `dst`.
+ // Squeeze multiple inner dims into one if they are contiguous in `target`.
IndexType num_squeezed_dims = 0;
for (Index i = 1; i < NumDims; ++i) {
const Index dim = is_col_major ? i : NumDims - i - 1;
- const IndexType dst_stride = dst.strides[dim];
+ const IndexType target_stride = target.strides[dim];
- if (output_inner_dim_size == dst_stride) {
- output_inner_dim_size *= dst.dims[dim];
+ if (output_inner_dim_size == target_stride) {
+ output_inner_dim_size *= target.dims[dim];
num_squeezed_dims++;
} else {
break;
@@ -936,22 +1139,22 @@ class TensorBlockAssignment {
const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
it[idx].count = 0;
- it[idx].size = dst.dims[dim];
- it[idx].output_stride = dst.strides[dim];
+ it[idx].size = target.dims[dim];
+ it[idx].output_stride = target.strides[dim];
it[idx].output_span = it[i].output_stride * (it[i].size - 1);
idx++;
}
// We read block expression from the beginning, and start writing data to
- // `dst` at given offset.
+ // `target` at given offset.
IndexType input_offset = 0;
- IndexType output_offset = dst.offset;
+ IndexType output_offset = target.offset;
- // Iterate copying data from `eval` to `dst`.
+ // Iterate copying data from `eval` to `target`.
for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
- // Assign to `dst` at current offset.
+ // Assign to `target` at current offset.
InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
- TensorBlockEvaluator>::Run(dst.data + output_offset,
+ TensorBlockEvaluator>::Run(target.data + output_offset,
output_inner_dim_size, eval,
input_offset);
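
For reference, a standalone sketch (plain C++, not Eigen internals) of the direct-access predicate used by TensorMaterializedBlock::materialize above: scanning from the innermost dimension outward, block and data dimensions must match; after the first mismatch at most one dimension may differ, and every remaining outer block dimension must be 1. Column-major ordering and the names below are assumptions for illustration.

#include <vector>

// Dimensions are listed innermost-first (column-major convention).
bool can_use_direct_access(const std::vector<long>& data_dims,
                           const std::vector<long>& block_dims) {
  const int num_dims = static_cast<int>(data_dims.size());

  // Count how many inner dimensions of the block match the data exactly.
  int num_matching_inner_dims = 0;
  for (int i = 0; i < num_dims; ++i) {
    if (data_dims[i] != block_dims[i]) break;
    ++num_matching_inner_dims;
  }

  // The dimension right after the matching run may be smaller (a partial
  // slice); every dimension above it must be of size 1.
  for (int i = num_matching_inner_dims + 1; i < num_dims; ++i) {
    if (block_dims[i] != 1) return false;
  }
  return true;
}

// Example mirroring the comment above (row-major [11, 12, 13, 14] written
// innermost-first): data {14, 13, 12, 11}, block {14, 3, 1, 1} -> true,
// so the block is a contiguous view starting at data + desc.offset().
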
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 9e4fae99a..dc9551d32 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -1247,10 +1247,10 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
TensorBlockAssignment;
- typename TensorBlockAssignment::Dst assignment_dst(
- input_block_sizes, input_block_strides, *materialized_input);
-
- TensorBlockAssignment::Run(assignment_dst, input_block.expr());
+ TensorBlockAssignment::Run(
+ TensorBlockAssignment::target(input_block_sizes, input_block_strides,
+ *materialized_input),
+ input_block.expr());
input_buffer = *materialized_input;
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index a8160e17e..cc3e67677 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -294,23 +294,45 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
typedef typename Storage::Type EvaluatorPointerType;
enum {
- IsAligned = false,
- PacketAccess =
+ IsAligned = false,
+ PacketAccess =
#ifndef EIGEN_USE_SYCL
- true,
+ true,
#else
- TensorEvaluator<ArgType, Device>::PacketAccess &
- internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+ TensorEvaluator<ArgType, Device>::PacketAccess &
+ internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
#endif
- BlockAccess = false,
- BlockAccessV2 = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- RawAccess = false
+ BlockAccess = false,
+ BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
};
+ static const int NumDims = internal::array_size<Dimensions>::value;
+
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+ ArgTensorBlock;
+
+ struct TensorConversionOpBlockFactory {
+ template <typename ArgXprType>
+ struct XprType {
+ typedef TensorConversionOp<TargetType, const ArgXprType> type;
+ };
+
+ template <typename ArgXprType>
+ typename XprType<ArgXprType>::type expr(const ArgXprType& expr) const {
+ return typename XprType<ArgXprType>::type(expr);
+ }
+ };
+
+ typedef internal::TensorUnaryExprBlock<TensorConversionOpBlockFactory,
+ ArgTensorBlock>
+ TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -376,6 +398,17 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ m_impl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ return TensorBlockV2(m_impl.blockV2(desc, scratch),
+ TensorConversionOpBlockFactory());
+ }
+
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
/// required by sycl in order to extract the sycl accessor
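
The conversion evaluator above never materializes the cast into a temporary buffer; it asks its argument for a block and lazily wraps that block's expression in a TensorConversionOp built by a small factory. Below is a standalone sketch of this factory / TensorUnaryExprBlock pattern; every name in it is illustrative and not Eigen API.

// Stand-in for TensorConversionOp<double, const ArgXpr>: tags the wrapped
// argument expression with the target type.
template <typename Target, typename ArgXpr>
struct ConvertXpr {
  ArgXpr arg;
};

// Factory: names the wrapping expression type for a given argument expression
// type, and knows how to construct it.
struct ConvertBlockFactory {
  template <typename ArgXpr>
  struct XprType {
    typedef ConvertXpr<double, ArgXpr> type;
  };

  template <typename ArgXpr>
  typename XprType<ArgXpr>::type expr(const ArgXpr& arg) const {
    return typename XprType<ArgXpr>::type{arg};
  }
};

// Lazy unary block: holds the argument block and the factory, and only builds
// the wrapping expression when the assignment kernel calls expr().
template <typename Factory, typename ArgBlock>
struct UnaryExprBlock {
  typedef typename Factory::template XprType<
      typename ArgBlock::XprType>::type XprType;

  UnaryExprBlock(const ArgBlock& arg_block, const Factory& factory)
      : m_arg_block(arg_block), m_factory(factory) {}

  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }

  ArgBlock m_arg_block;
  Factory m_factory;
};
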
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index c87075a72..b1d668744 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -176,11 +176,12 @@ struct TensorEvaluator
typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
Index>
TensorBlockAssign;
- typename TensorBlockAssign::Dst dst(desc.dimensions(),
- internal::strides<Layout>(m_dims),
- m_data, desc.offset());
- TensorBlockAssign::Run(dst, block.expr());
+ TensorBlockAssign::Run(
+ TensorBlockAssign::target(desc.dimensions(),
+ internal::strides<Layout>(m_dims), m_data,
+ desc.offset()),
+ block.expr());
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
@@ -349,62 +350,7 @@ struct TensorEvaluator<const Derived, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
assert(m_data != NULL);
-
- // TODO(ezhulenev): Move it to TensorBlockV2 and reuse in TensorForcedEval.
-
- // If a tensor block descriptor covers a contiguous block of the underlying
- // memory, we can skip block buffer memory allocation, and construct a block
- // from existing `m_data` memory buffer.
- //
- // Example: (RowMajor layout)
- // m_dims: [11, 12, 13, 14]
- // desc.dimensions(): [1, 1, 3, 14]
- //
- // In this case we can construct a TensorBlock starting at
- // `m_data + desc.offset()`, with a `desc.dimensions()` block sizes.
-
- static const bool
- is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
-
- // Find out how many inner dimensions have a matching size.
- int num_matching_inner_dims = 0;
- for (int i = 0; i < NumCoords; ++i) {
- int dim = is_col_major ? i : NumCoords - i - 1;
- if (m_dims[dim] != desc.dimensions()[dim]) break;
- ++num_matching_inner_dims;
- }
-
- // All the outer dimensions must be of size `1`, except a single dimension
- // before the matching inner dimension (`3` in the example above).
- bool can_use_direct_access = true;
- for (int i = num_matching_inner_dims + 1; i < NumCoords; ++i) {
- int dim = is_col_major ? i : NumCoords - i - 1;
- if (desc.dimension(dim) != 1) {
- can_use_direct_access = false;
- break;
- }
- }
-
- if (can_use_direct_access) {
- EvaluatorPointerType block_start = m_data + desc.offset();
- return TensorBlockV2(internal::TensorBlockKind::kView, block_start,
- desc.dimensions());
-
- } else {
- void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
- ScalarNoConst* block_buffer = static_cast<ScalarNoConst*>(mem);
-
- TensorBlockIOSrc src(internal::strides<Layout>(m_dims), m_data,
- desc.offset());
- TensorBlockIODst dst(desc.dimensions(),
- internal::strides<Layout>(desc.dimensions()),
- block_buffer);
-
- TensorBlockIO::Copy(dst, src);
-
- return TensorBlockV2(internal::TensorBlockKind::kMaterializedInScratch,
- block_buffer, desc.dimensions());
- }
+ return TensorBlockV2::materialize(m_data, m_dims, desc, scratch);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
@@ -923,15 +869,21 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
typedef typename XprType::Scalar Scalar;
enum {
- IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
- PacketType<Scalar, Device>::HasBlend,
- BlockAccess = false,
- BlockAccessV2 = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<IfArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
+ IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned &
+ TensorEvaluator<ElseArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess &
+ TensorEvaluator<ElseArgType, Device>::PacketAccess &
+ PacketType<Scalar, Device>::HasBlend,
+ BlockAccess = false,
+ BlockAccessV2 = TensorEvaluator<IfArgType, Device>::BlockAccessV2 &&
+ TensorEvaluator<ThenArgType, Device>::BlockAccessV2 &&
+ TensorEvaluator<ElseArgType, Device>::BlockAccessV2,
+ PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess ||
+ TensorEvaluator<ThenArgType, Device>::PreferBlockAccess ||
+ TensorEvaluator<ElseArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<IfArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
@@ -953,8 +905,36 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
+ static const int NumDims = internal::array_size<Dimensions>::value;
+
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlockV2
+ IfArgTensorBlock;
+ typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlockV2
+ ThenArgTensorBlock;
+ typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlockV2
+ ElseArgTensorBlock;
+
+ struct TensorSelectOpBlockFactory {
+ template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+ struct XprType {
+ typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type;
+ };
+
+ template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+ typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr(
+ const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const {
+ return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr);
+ }
+ };
+
+ typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory,
+ IfArgTensorBlock, ThenArgTensorBlock,
+ ElseArgTensorBlock>
+ TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -1000,6 +980,24 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
.cwiseMax(m_elseImpl.costPerCoeff(vectorized));
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ m_condImpl.getResourceRequirements(resources);
+ m_thenImpl.getResourceRequirements(resources);
+ m_elseImpl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ // It's unsafe to pass the destination buffer to the underlying expressions,
+ // because the output might be aliased with one of the inputs.
+ desc.DropDestinationBuffer();
+
+ return TensorBlockV2(
+ m_condImpl.blockV2(desc, scratch), m_thenImpl.blockV2(desc, scratch),
+ m_elseImpl.blockV2(desc, scratch), TensorSelectOpBlockFactory());
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
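
The blockV2 above drops the destination buffer before delegating to the condition/then/else evaluators because a select's output may alias one of its inputs; if an argument materialized its block directly into that memory, another argument reading the same memory could observe clobbered values. A minimal example of such aliasing (shapes illustrative):

#include <unsupported/Eigen/CXX11/Tensor>

void select_output_aliases_input() {
  Eigen::Tensor<float, 1> t(16), u(16);
  Eigen::Tensor<bool, 1> c(16);
  t.setRandom();
  u.setRandom();
  c.setRandom();

  // The output `t` aliases the `then` argument of the select. Materializing a
  // block of the condition or `else` argument into `t`'s memory before the
  // select is applied would corrupt the `then` values still to be read.
  t = c.select(t, u);
}
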
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index be8f3a734..2a3398d67 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -324,6 +324,17 @@ struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
}
};
+template <typename FirstType, typename... OtherTypes>
+std::ostream& operator<<(std::ostream& os,
+ const IndexList<FirstType, OtherTypes...>& dims) {
+ os << "[";
+ for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) {
+ if (i > 0) os << ", ";
+ os << dims[i];
+ }
+ os << "]";
+ return os;
+}
template<typename FirstType, typename... OtherTypes>
constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
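
A small usage sketch for the operator<< added above: it lets tests and debug output print an IndexList (which mixes compile-time and runtime entries) like an ordinary dimensions object. Assumes a C++11 build where Eigen's IndexList is available.

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

void print_index_list() {
  Eigen::IndexList<Eigen::type2index<1>, int> dims;
  dims.set(1, 7);             // first entry is statically 1
  std::cout << dims << "\n";  // prints: [1, 7]
}
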
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index c8333e488..5d4b0f061 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -113,6 +113,25 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
static const int NumOutputDims = internal::array_size<Dimensions>::value;
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ enum ReshapingKind {
+ // We do not use layout information to determine reshaping kind.
+ // Depending on the layout `N` can be inner or outer dimension.
+ OneByN = 0, // expr.reshape(1, N)
+ NByOne = 1, // expr.reshape(N, 1)
+ Runtime = 2 // Reshape dimensions are dynamic (specified at runtime).
+ };
+
+ // clang-format off
+ static const ReshapingKind kind =
+#if defined(EIGEN_HAS_INDEX_LIST)
+ (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/0, /*value=*/1)) ? OneByN
+ : (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/1, /*value=*/1)) ? NByOne
+ : Runtime;
+#else
+ Runtime;
+#endif
+ // clang-format on
+
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
@@ -121,8 +140,12 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
TensorEvaluator<ArgType, Device>::RawAccess &&
NumInputDims > 0 && NumOutputDims > 0,
- BlockAccessV2 = false,
- PreferBlockAccess = true,
+ // For trivial reshapes with raw access to underlying data we will provide
+ // zero overhead block access.
+ // TODO(ezhulenev): Consider adding block access without raw access?
+ BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess &&
+ NumInputDims > 0 && NumOutputDims > 0,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
@@ -139,7 +162,13 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
OutputTensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef
+ typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims,
+ Layout, Index>
+ TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -199,8 +228,9 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
- std::vector<internal::TensorOpResourceRequirements>* resources) const {
- m_impl.getResourceRequirements(resources);
+ std::vector<internal::TensorOpResourceRequirements>*) const {
+ // TODO(ezhulenev): If we ever support block evaluation without raw access,
+ // we'll need to get requirements from `m_impl`.
}
// required in block(OutputTensorBlock* output_block) const
@@ -334,6 +364,26 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ eigen_assert(m_impl.data() != NULL);
+ eigen_assert((kind == Runtime) ||
+ (kind == OneByN && desc.dimensions()[0] == 1) ||
+ (kind == NByOne && desc.dimensions()[1] == 1));
+
+ if (kind == OneByN || kind == NByOne) {
+ // We can guarantee at compile time that block is just a contiguous slice
+ // of the underlying expression memory buffer.
+ return TensorBlockV2(internal::TensorBlockKind::kView,
+ m_impl.data() + desc.offset(), desc.dimensions());
+ } else {
+ // This will do additional runtime checks; in the end it might also be a
+ // view, or a block materialized in a temporary buffer.
+ return TensorBlockV2::materialize(m_impl.data(), m_dimensions, desc,
+ scratch);
+ }
+ }
+
EIGEN_DEVICE_FUNC typename Storage::Type data() const {
return constCast(m_impl.data());
}
@@ -365,14 +415,14 @@ template<typename NewDimensions, typename ArgType, typename Device>
typedef NewDimensions Dimensions;
enum {
- IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- BlockAccessV2 = false,
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -385,18 +435,37 @@ template<typename NewDimensions, typename ArgType, typename Device>
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ typedef internal::TensorBlockDescriptor<TensorEvaluator::NumOutputDims, Index>
+ TensorBlockDesc;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(index);
}
+
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
this->m_impl.template writePacket<StoreMode>(index, x);
}
+
+ template <typename TensorBlock>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
+ const TensorBlockDesc& desc, const TensorBlock& block) {
+ assert(this->m_impl.data() != NULL);
+
+ typedef typename TensorBlock::XprType TensorBlockExpr;
+ typedef internal::TensorBlockAssignment<
+ Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index>
+ TensorBlockAssign;
+
+ TensorBlockAssign::Run(
+ TensorBlockAssign::target(desc.dimensions(),
+ internal::strides<Layout>(this->dimensions()),
+ this->m_impl.data(), desc.offset()),
+ block.expr());
+ }
};
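
A sketch of how the compile-time ReshapingKind classification above is reached from user code: passing reshape an IndexList whose leading (or trailing) entry is a static type2index<1> lets index_statically_eq tag the reshape as OneByN (or NByOne) at compile time, so blockV2 can return a plain view of the underlying buffer without the runtime checks in materialize. Shapes below are illustrative; requires Eigen's IndexList (C++11).

#include <unsupported/Eigen/CXX11/Tensor>

void static_reshape_kinds() {
  Eigen::Tensor<float, 1> v(100);
  v.setRandom();

  // reshape(N, 1): the second dimension is statically 1 -> kind == NByOne.
  Eigen::IndexList<int, Eigen::type2index<1>> n_by_one;
  n_by_one.set(0, 100);
  Eigen::Tensor<float, 2> a = v.reshape(n_by_one);

  // reshape(1, N): the first dimension is statically 1 -> kind == OneByN.
  Eigen::IndexList<Eigen::type2index<1>, int> one_by_n;
  one_by_n.set(1, 100);
  Eigen::Tensor<float, 2> b = v.reshape(one_by_n);
}
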
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 7b9ad7374..be2449ebd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -96,22 +96,29 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
typedef typename Storage::Type EvaluatorPointerType;
enum {
- IsAligned = true,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- BlockAccessV2 = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = true,
- RawAccess = false
+ IsAligned = true,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = true,
+ RawAccess = false
};
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+ Layout, Index>
+ TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
+ : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
{
// The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
// to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
@@ -212,6 +219,214 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
return cost;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+ 1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+ resources->push_back(internal::TensorOpResourceRequirements(
+ internal::kSkewedInnerDims, block_total_size_max));
+
+ m_impl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ eigen_assert(m_impl.data() != NULL);
+
+ // Check if we can reuse the `desc` destination buffer, or allocate a new
+ // scratch buffer.
+ ScalarNoConst* materialized_output =
+ desc.template destination<ScalarNoConst, Layout>();
+
+ bool materialized_in_output;
+ if (materialized_output != NULL) {
+ desc.DropDestinationBuffer();
+ materialized_in_output = true;
+
+ } else {
+ const size_t materialized_output_size = desc.size() * sizeof(Scalar);
+ void* output_scratch_mem = scratch.allocate(materialized_output_size);
+ materialized_output = static_cast<ScalarNoConst*>(output_scratch_mem);
+ materialized_in_output = false;
+ }
+
+ static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
+
+ Index offset = desc.offset();
+
+ // Compute offsets in the output tensor corresponding to the desc.offset().
+ DSizes<Index, NumDims> output_offsets;
+ for (int i = NumDims - 1; i > 0; --i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+ const int stride_dim = IsColMajor ? dim : dim + 1;
+ output_offsets[dim] = offset / m_outputStrides[stride_dim];
+ offset -= output_offsets[dim] * m_outputStrides[stride_dim];
+ }
+ output_offsets[IsColMajor ? 0 : NumDims - 1] = offset;
+
+ // Offsets in the input corresponding to output offsets.
+ DSizes<Index, NumDims> input_offsets = output_offsets;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+ input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
+ }
+
+ // Compute the offset in the input buffer (at this point it might be invalid
+ // and point outside of the input buffer, because we don't check for negative
+ // offsets; it will be corrected in the block iteration loop below).
+ Index input_offset = 0;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+ input_offset += input_offsets[dim] * m_inputStrides[dim];
+ }
+
+ // The destination buffer and the scratch buffer are both indexed from 0 and
+ // have the same dimensions as the requested block (for the destination
+ // buffer this property is guaranteed by `desc.destination()`).
+ Index output_offset = 0;
+ const DSizes<Index, NumDims> output_strides =
+ internal::strides<Layout>(desc.dimensions());
+
+ // NOTE(ezhulenev): We initialize block iteration state for `NumDims - 1`
+ // dimensions, skipping the innermost dimension. In theory it should be
+ // possible to squeeze matching innermost dimensions, but in practice that
+ // did not show any improvement in benchmarks. Also, in practice the first
+ // outer dimension usually has padding, which prevents squeezing.
+
+ // Initialize the output block iterator state. Dimensions in this array are
+ // always in innermost -> outermost order (col-major layout).
+ array<BlockIteratorState, NumDims - 1> it;
+ for (Index i = 0; i < NumDims - 1; ++i) {
+ const Index dim = IsColMajor ? i + 1 : NumDims - i - 2;
+ it[i].count = 0;
+ it[i].size = desc.dimension(dim);
+
+ it[i].input_stride = m_inputStrides[dim];
+ it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+ it[i].output_stride = output_strides[dim];
+ it[i].output_span = it[i].output_stride * (it[i].size - 1);
+ }
+
+ const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
+
+ // Total output size.
+ const Index output_size = desc.size();
+
+ // We will fill inner dimension of this size in the output. It might be
+ // larger than the inner dimension in the input, so we might have to pad
+ // before/after we copy values from the input inner dimension.
+ const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
+
+ // How many values to fill with padding BEFORE reading from the input inner
+ // dimension.
+ const Index output_inner_pad_before_size =
+ input_offsets[inner_dim_idx] < 0
+ ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
+ output_inner_dim_size)
+ : 0;
+
+ // How many values we can actually copy from the input inner dimension.
+ const Index output_inner_copy_size = numext::mini(
+ // Want to copy from input.
+ (output_inner_dim_size - output_inner_pad_before_size),
+ // Can copy from input.
+ (static_cast<Index>(m_impl.dimensions()[inner_dim_idx]) -
+ numext::maxi(input_offsets[inner_dim_idx], Index(0))));
+
+ // How many values to fill with padding AFTER reading from the input inner
+ // dimension.
+ const Index output_inner_pad_after_size =
+ (output_inner_dim_size - output_inner_copy_size -
+ output_inner_pad_before_size);
+
+ // Sanity check, sum of all sizes must be equal to the output size.
+ eigen_assert(output_inner_dim_size ==
+ (output_inner_pad_before_size + output_inner_copy_size +
+ output_inner_pad_after_size));
+
+ // Keep track of current coordinates and padding in the output.
+ DSizes<Index, NumDims> output_coord = output_offsets;
+ DSizes<Index, NumDims> output_padded;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+ output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+ }
+
+ typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
+
+ // Iterate copying data from `m_impl.data()` to the output buffer.
+ for (Index size = 0; size < output_size; size += output_inner_dim_size) {
+ // Detect if we are in the padded region (exclude innermost dimension).
+ bool is_padded = false;
+ for (int j = 1; j < NumDims; ++j) {
+ const int dim = IsColMajor ? j : NumDims - j - 1;
+ is_padded = output_padded[dim];
+ if (is_padded) break;
+ }
+
+ if (is_padded) {
+ // Fill with padding value.
+ LinCopy::template Run<LinCopy::Kind::FillLinear>(
+ typename LinCopy::Dst(output_offset, 1, materialized_output),
+ typename LinCopy::Src(0, 0, &m_paddingValue),
+ output_inner_dim_size);
+
+ } else {
+ { // Fill with padding before copying from input inner dimension.
+ const Index out = output_offset;
+
+ LinCopy::template Run<LinCopy::Kind::FillLinear>(
+ typename LinCopy::Dst(out, 1, materialized_output),
+ typename LinCopy::Src(0, 0, &m_paddingValue),
+ output_inner_pad_before_size);
+ }
+
+ { // Copy data from input inner dimension.
+ const Index out = output_offset + output_inner_pad_before_size;
+ const Index in = input_offset + output_inner_pad_before_size;
+
+ LinCopy::template Run<LinCopy::Kind::Linear>(
+ typename LinCopy::Dst(out, 1, materialized_output),
+ typename LinCopy::Src(in, 1, m_impl.data()),
+ output_inner_copy_size);
+ }
+
+ { // Fill with padding after copying from input inner dimension.
+ const Index out = output_offset + output_inner_pad_before_size +
+ output_inner_copy_size;
+
+ LinCopy::template Run<LinCopy::Kind::FillLinear>(
+ typename LinCopy::Dst(out, 1, materialized_output),
+ typename LinCopy::Src(0, 0, &m_paddingValue),
+ output_inner_pad_after_size);
+ }
+ }
+
+ for (int j = 0; j < NumDims - 1; ++j) {
+ const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
+
+ if (++it[j].count < it[j].size) {
+ input_offset += it[j].input_stride;
+ output_offset += it[j].output_stride;
+ output_coord[dim] += 1;
+ output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+ break;
+ }
+ it[j].count = 0;
+ input_offset -= it[j].input_span;
+ output_offset -= it[j].output_span;
+ output_coord[dim] -= it[j].size - 1;
+ output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+ }
+ }
+
+ return TensorBlockV2(materialized_in_output
+ ? internal::TensorBlockKind::kMaterializedInOutput
+ : internal::TensorBlockKind::kMaterializedInScratch,
+ materialized_output,
+ desc.dimensions());
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
@@ -222,6 +437,23 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
#endif
private:
+ struct BlockIteratorState {
+ BlockIteratorState()
+ : count(0),
+ size(0),
+ input_stride(0),
+ input_span(0),
+ output_stride(0),
+ output_span(0) {}
+
+ Index count;
+ Index size;
+ Index input_stride;
+ Index input_span;
+ Index output_stride;
+ Index output_span;
+ };
+
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
Index index, int dim_index) const {
#if defined(EIGEN_HAS_INDEX_LIST)
@@ -410,6 +642,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
PaddingDimensions m_padding;
Scalar m_paddingValue;
+
+ const Device EIGEN_DEVICE_REF m_device;
};
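
A standalone arithmetic sketch (plain integers, not Eigen code) of the inner-dimension bookkeeping in the padding blockV2 above: each output row is laid out as [pad_before | copied input | pad_after], and the three segment sizes follow from the block's inner size, the input's inner size, and the (possibly negative) input offset. Names below are illustrative.

#include <algorithm>
#include <cassert>

struct InnerSegments {
  long pad_before;  // values filled with the padding value before the copy
  long copy_size;   // values copied from the input inner dimension
  long pad_after;   // values filled with the padding value after the copy
};

InnerSegments inner_dim_segments(long block_inner_size,   // desc.dimension(inner)
                                 long input_inner_size,   // input dims[inner]
                                 long input_inner_offset  // may be negative
                                 ) {
  // Values to fill with the padding value BEFORE reading from the input.
  const long pad_before =
      input_inner_offset < 0 ? std::min(-input_inner_offset, block_inner_size)
                             : 0;

  // Values that can actually be copied from the input inner dimension.
  const long copy_size =
      std::min(block_inner_size - pad_before,
               input_inner_size - std::max(input_inner_offset, 0L));

  // Values to fill with the padding value AFTER the copied range.
  const long pad_after = block_inner_size - copy_size - pad_before;

  // The three segments always cover exactly the block's inner dimension.
  assert(block_inner_size == pad_before + copy_size + pad_after);
  return {pad_before, copy_size, pad_after};
}

// Example: an input inner size of 10 padded with (2 before, 3 after) gives a
// padded inner size of 15; a block covering the whole padded row starts at
// input offset -2 and splits as {pad_before = 2, copy_size = 10, pad_after = 3}.
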
diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp
index e85b81141..ff98e170d 100644
--- a/unsupported/test/cxx11_tensor_block_eval.cpp
+++ b/unsupported/test/cxx11_tensor_block_eval.cpp
@@ -104,6 +104,17 @@ static TensorBlockParams<NumDims> FixedSizeBlock(DSizes<Index, NumDims> dims) {
return {offsets, dims, TensorBlockDescriptor<NumDims, Index>(0, dims)};
}
+inline Eigen::IndexList<int, Eigen::type2index<1>> NByOne(int n) {
+ Eigen::IndexList<int, Eigen::type2index<1>> ret;
+ ret.set(0, n);
+ return ret;
+}
+inline Eigen::IndexList<Eigen::type2index<1>, int> OneByM(int m) {
+ Eigen::IndexList<Eigen::type2index<1>, int> ret;
+ ret.set(1, m);
+ return ret;
+}
+
// -------------------------------------------------------------------------- //
// Verify that block expression evaluation produces the same result as a
// TensorSliceOp (reading a tensor block is the same as taking a tensor slice).
@@ -174,7 +185,7 @@ static void test_eval_tensor_block() {
// Identity tensor expression transformation.
VerifyBlockEvaluator<T, NumDims, Layout>(
- input, [&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
+ input, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
}
template <typename T, int NumDims, int Layout>
@@ -184,7 +195,7 @@ static void test_eval_tensor_unary_expr_block() {
input.setRandom();
VerifyBlockEvaluator<T, NumDims, Layout>(
- input.square(), [&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
+ input.square(), [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
}
template <typename T, int NumDims, int Layout>
@@ -195,7 +206,7 @@ static void test_eval_tensor_binary_expr_block() {
rhs.setRandom();
VerifyBlockEvaluator<T, NumDims, Layout>(
- lhs + rhs, [&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
+ lhs + rhs, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
}
template <typename T, int NumDims, int Layout>
@@ -207,7 +218,7 @@ static void test_eval_tensor_binary_with_unary_expr_block() {
VerifyBlockEvaluator<T, NumDims, Layout>(
(lhs.square() + rhs.square()).sqrt(),
- [&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
+ [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
}
template <typename T, int NumDims, int Layout>
@@ -236,6 +247,114 @@ static void test_eval_tensor_broadcast() {
[&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
}
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_reshape() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10);
+
+ DSizes<Index, NumDims> shuffled = dims;
+ std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed));
+
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.reshape(shuffled),
+ [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.reshape(shuffled),
+ [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); });
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_reshape_with_bcast() {
+ Index dim = internal::random<Index>(1, 100);
+
+ Tensor<T, 2, Layout> lhs(1, dim);
+ Tensor<T, 2, Layout> rhs(dim, 1);
+ lhs.setRandom();
+ rhs.setRandom();
+
+ auto reshapeLhs = NByOne(dim);
+ auto reshapeRhs = OneByM(dim);
+
+ auto bcastLhs = OneByM(dim);
+ auto bcastRhs = NByOne(dim);
+
+ DSizes<Index, 2> dims(dim, dim);
+
+ VerifyBlockEvaluator<T, 2, Layout>(
+ lhs.reshape(reshapeLhs).broadcast(bcastLhs) +
+ rhs.reshape(reshapeRhs).broadcast(bcastRhs),
+ [dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_cast() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.template cast<int>().template cast<T>(),
+ [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_select() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> lhs(dims);
+ Tensor<T, NumDims, Layout> rhs(dims);
+ Tensor<bool, NumDims, Layout> cond(dims);
+ lhs.setRandom();
+ rhs.setRandom();
+ cond.setRandom();
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(cond.select(lhs, rhs), [&dims]() {
+ return RandomBlock<Layout>(dims, 1, 20);
+ });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_padding() {
+ const int inner_dim = Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ DSizes<Index, NumDims> pad_before = RandomDims<NumDims>(0, 4);
+ DSizes<Index, NumDims> pad_after = RandomDims<NumDims>(0, 4);
+ array<std::pair<Index, Index>, NumDims> paddings;
+ for (int i = 0; i < NumDims; ++i) {
+ paddings[i] = std::make_pair(pad_before[i], pad_after[i]);
+ }
+
+ // Test squeezing reads from inner dim.
+ if (internal::random<bool>()) {
+ pad_before[inner_dim] = 0;
+ pad_after[inner_dim] = 0;
+ paddings[inner_dim] = std::make_pair(0, 0);
+ }
+
+ DSizes<Index, NumDims> padded_dims;
+ for (int i = 0; i < NumDims; ++i) {
+ padded_dims[i] = dims[i] + pad_before[i] + pad_after[i];
+ }
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.pad(paddings),
+ [&padded_dims]() { return FixedSizeBlock(padded_dims); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.pad(paddings),
+ [&padded_dims]() { return RandomBlock<Layout>(padded_dims, 1, 10); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.pad(paddings),
+ [&padded_dims]() { return SkewedInnerBlock<Layout>(padded_dims); });
+}
+
// -------------------------------------------------------------------------- //
// Verify that assigning a block to a Tensor expression produces the same result
// as an assignment to TensorSliceOp (writing a block is identical to
@@ -300,7 +419,7 @@ static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
// -------------------------------------------------------------------------- //
template <typename T, int NumDims, int Layout>
-static void test_assign_tensor_block() {
+static void test_assign_to_tensor() {
DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
Tensor<T, NumDims, Layout> tensor(dims);
@@ -312,11 +431,32 @@ static void test_assign_tensor_block() {
tensor, map, [&dims]() { return FixedSizeBlock(dims); });
}
-// -------------------------------------------------------------------------- //
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_reshape() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> tensor(dims);
+
+ TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
-//#define CALL_SUBTESTS(NAME) CALL_SUBTEST((NAME<float, 2, RowMajor>()))
+ DSizes<Index, NumDims> shuffled = dims;
+ std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed));
+
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map.reshape(shuffled),
+ [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); });
-#define CALL_SUBTESTS(NAME) \
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map.reshape(shuffled),
+ [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); });
+
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map.reshape(shuffled),
+ [&shuffled]() { return FixedSizeBlock(shuffled); });
+}
+
+// -------------------------------------------------------------------------- //
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS(NAME) \
CALL_SUBTEST((NAME<float, 1, RowMajor>())); \
CALL_SUBTEST((NAME<float, 2, RowMajor>())); \
CALL_SUBTEST((NAME<float, 4, RowMajor>())); \
@@ -326,14 +466,24 @@ static void test_assign_tensor_block() {
CALL_SUBTEST((NAME<float, 4, ColMajor>())); \
CALL_SUBTEST((NAME<float, 5, ColMajor>()))
+#define CALL_SUBTESTS_LAYOUTS(NAME) \
+ CALL_SUBTEST((NAME<float, RowMajor>())); \
+ CALL_SUBTEST((NAME<float, ColMajor>()))
+
EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
// clang-format off
- CALL_SUBTESTS(test_eval_tensor_block);
- CALL_SUBTESTS(test_eval_tensor_unary_expr_block);
- CALL_SUBTESTS(test_eval_tensor_binary_expr_block);
- CALL_SUBTESTS(test_eval_tensor_binary_with_unary_expr_block);
- CALL_SUBTESTS(test_eval_tensor_broadcast);
-
- CALL_SUBTESTS(test_assign_tensor_block);
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_block);
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_unary_expr_block);
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_binary_expr_block);
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_binary_with_unary_expr_block);
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_broadcast);
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_reshape);
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_cast);
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_eval_tensor_padding);
+
+ CALL_SUBTESTS_LAYOUTS(test_eval_tensor_reshape_with_bcast);
+
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor);
+ CALL_SUBTESTS_DIMS_LAYOUTS(test_assign_to_tensor_reshape);
// clang-format on
}
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index c233fe30f..9094b6507 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -582,11 +582,10 @@ static void test_async_execute_unary_expr(Device d)
Eigen::Barrier done(1);
auto on_done = [&done]() { done.Notify(); };
- static const bool TilingOn = Tiling == TiledEvaluation::Off ? false : true;
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using DoneCallback = decltype(on_done);
using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
- Vectorizable, TilingOn>;
+ Vectorizable, Tiling>;
Executor::runAsync(Assign(dst, expr), d, on_done);
done.Wait();
@@ -619,11 +618,10 @@ static void test_async_execute_binary_expr(Device d)
Eigen::Barrier done(1);
auto on_done = [&done]() { done.Notify(); };
- static const bool TilingOn = Tiling == TiledEvaluation::Off ? false : true;
using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
using DoneCallback = decltype(on_done);
using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
- Vectorizable, TilingOn>;
+ Vectorizable, Tiling>;
Executor::runAsync(Assign(dst, expr), d, on_done);
done.Wait();
@@ -737,10 +735,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 4);
CALL_SUBTEST_COMBINATIONS_V1(8, test_execute_reduction, float, 5);
- CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 2);
- CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 3);
- CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 4);
- CALL_SUBTEST_COMBINATIONS_V1(9, test_execute_reshape, float, 5);
+ CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 2);
+ CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 3);
+ CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 4);
+ CALL_SUBTEST_COMBINATIONS_V2(9, test_execute_reshape, float, 5);
CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 2);
CALL_SUBTEST_COMBINATIONS_V1(10, test_execute_slice_rvalue, float, 3);
@@ -779,4 +777,3 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
// Force CMake to split this test.
// EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
}
-