author    Eugene Zhulenev <ezhulenev@google.com>    2019-10-14 14:31:59 -0700
committer Eugene Zhulenev <ezhulenev@google.com>    2019-10-14 14:31:59 -0700
commit    d380c23b2cc0b02e10819e779c73cde2c62603b2 (patch)
tree      09d204c87ed6a9f55aa0c6305d7e4199a71dbf8a /unsupported/Eigen/CXX11/src
parent    39fb9eeccf2e79542acad9bbf5196e462c1b2cee (diff)
Block evaluation for TensorGenerator/TensorReverse/TensorShuffling
Diffstat (limited to 'unsupported/Eigen/CXX11/src')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h       |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h      | 112
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h |   3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h     |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h   |   3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h   |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h    |  15
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h     |  13
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h   |   3
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h    |   4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h     |  33
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h      |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h      |   7
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h    | 147
14 files changed, 273 insertions, 84 deletions
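
The diff below threads a new boolean argument, root_of_expr_ast, through every blockV2() implementation. Before the diff, here is a minimal standalone sketch of the idea the flag encodes (plain structs and hypothetical names, not Eigen's types): a block may be materialized directly into a destination buffer whose strides do not match the block strides, but only at the root of the expression AST, because the resulting block expression would be invalid for further use.

#include <cassert>
#include <vector>

// Hypothetical, simplified mirror of the decision added in this patch (see
// TensorShuffling::blockV2 below): materialize into the caller's destination
// buffer when possible, fall back to scratch memory otherwise.
enum class BlockKind { kView, kMaterializedInScratch, kMaterializedInOutput };

struct Block {
  BlockKind kind;
  const float* data;
  bool valid_expr;  // false if the buffer strides do not match block strides
};

struct Destination {
  float* data;         // caller-provided destination buffer (may be null)
  bool strides_match;  // is the buffer laid out exactly like the block?
};

Block evaluate_block(const Destination& dst, std::vector<float>& scratch,
                     bool root_of_expr_ast) {
  if (dst.data != nullptr && dst.strides_match) {
    // Contiguous destination: materialize there and keep a valid expression.
    return {BlockKind::kMaterializedInOutput, dst.data, /*valid_expr=*/true};
  }
  if (dst.data != nullptr && root_of_expr_ast) {
    // Strided destination: only allowed at the root of the expression tree;
    // any expression built on top of this buffer would be invalid.
    return {BlockKind::kMaterializedInOutput, dst.data, /*valid_expr=*/false};
  }
  // Otherwise evaluate into a scratch buffer.
  return {BlockKind::kMaterializedInScratch, scratch.data(), /*valid_expr=*/true};
}

int main() {
  std::vector<float> scratch(16);
  Destination strided_dst{scratch.data(), /*strides_match=*/false};
  Block b = evaluate_block(strided_dst, scratch, /*root_of_expr_ast=*/true);
  assert(b.kind == BlockKind::kMaterializedInOutput && !b.valid_expr);
  return 0;
}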
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 29aa7a97e..f2b9389c8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -242,9 +242,8 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
(internal::array_prod(m_leftImpl.dimensions()) * sizeof(Scalar)));
}
- RightTensorBlock block = m_rightImpl.blockV2(desc, scratch);
- // If block was evaluated into a destination, there is no need to do
- // assignment.
+ RightTensorBlock block = m_rightImpl.blockV2(desc, scratch, /*root_of_expr_ast=*/true);
+ // If block was evaluated into a destination, there is no need to do assignment.
if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
m_leftImpl.writeBlockV2(desc, block);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
index b8c592543..099d7cd57 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
@@ -45,6 +45,12 @@ EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
return strides;
}
+template<int Layout, typename IndexType, size_t NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
+ const Eigen::array<IndexType, NumDims>& dimensions) {
+ return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
+}
+
#if EIGEN_HAS_CXX11
template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
@@ -78,23 +84,24 @@ class TensorBlockDescriptor {
return static_cast<Scalar*>(m_data);
}
- private:
- friend class TensorBlockDescriptor;
-
- DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}
+ template <typename Scalar>
+ Dimensions dimensions() const {
+ Dimensions dimensions;
+ for (int i = 0; i < NumDims; ++i) {
+ eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0);
+ dimensions[i] = m_dimensions[i] / sizeof(Scalar);
+ }
+ return dimensions;
+ }
template <typename Scalar>
- DestinationBuffer(Scalar* data, const Dimensions& dimensions,
- const Dimensions& strides, size_t total_dst_bytes)
- : m_data(static_cast<void*>(data)),
- m_dimensions(dimensions),
- m_strides(strides),
- m_total_dst_bytes(total_dst_bytes) {
- // TODO(ezhulenev): Benchmark template meta-unroll for this loop.
+ Dimensions strides() const {
+ Dimensions strides;
for (int i = 0; i < NumDims; ++i) {
- m_dimensions[i] *= sizeof(Scalar);
- m_strides[i] *= sizeof(Scalar);
+ eigen_assert(m_strides[i] % sizeof(Scalar) == 0);
+ strides[i] = m_strides[i] / sizeof(Scalar);
}
+ return strides;
}
// Returns true if the tensor block corresponding to `desc` fits into the
@@ -109,29 +116,34 @@ class TensorBlockDescriptor {
if (!dimensions_match(desc_dims, dst_dims)) return false;
const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
- const Dimensions& dst_strides = internal::strides<Layout>(dst_dims);
+ const Dimensions& dst_strides = strides<Scalar>();
- return dimensions_match(desc_strides, dst_strides);
- }
-
- template <typename Scalar>
- Dimensions dimensions() const {
- Dimensions dimensions;
+ // Compare strides ignoring dimensions of size `1`.
for (int i = 0; i < NumDims; ++i) {
- eigen_assert(m_dimensions[i] % sizeof(Scalar) == 0);
- dimensions[i] = m_dimensions[i] / sizeof(Scalar);
+ if (desc_dims[i] == 1) continue;
+ if (desc_strides[i] != dst_strides[i]) return false;
}
- return dimensions;
+
+ return true;
}
+ private:
+ friend class TensorBlockDescriptor;
+
+ DestinationBuffer() : m_data(NULL), m_total_dst_bytes(0) {}
+
template <typename Scalar>
- Dimensions strides() const {
- Dimensions strides;
+ DestinationBuffer(Scalar* data, const Dimensions& dimensions,
+ const Dimensions& strides, size_t total_dst_bytes)
+ : m_data(static_cast<void*>(data)),
+ m_dimensions(dimensions),
+ m_strides(strides),
+ m_total_dst_bytes(total_dst_bytes) {
+ // TODO(ezhulenev): Benchmark template meta-unroll for this loop.
for (int i = 0; i < NumDims; ++i) {
- eigen_assert(m_strides[i] % sizeof(Scalar) == 0);
- strides[i] = m_strides[i] / sizeof(Scalar);
+ m_dimensions[i] *= sizeof(Scalar);
+ m_strides[i] *= sizeof(Scalar);
}
- return strides;
}
void* m_data;
@@ -181,6 +193,12 @@ class TensorBlockDescriptor {
return *this;
}
+ bool HasDestinationBuffer() const { return m_destination.m_data != NULL; }
+
+ const DestinationBuffer& GetDestinationBuffer() const {
+ return m_destination;
+ }
+
// Returns a non-nullptr pointer to a destination buffer memory if this
// block has a contiguous destination buffer.
template <typename Scalar, int Layout>
@@ -191,6 +209,11 @@ class TensorBlockDescriptor {
return NULL;
}
+ // Returns a copy of `*this` with updated offset.
+ TensorBlockDescriptor WithOffset(IndexType offset) const {
+ return TensorBlockDescriptor(offset, m_dimensions, m_destination);
+ }
+
private:
// Offset and dimensions are immutable after construction. Block descriptor
// can only be mutated by adding or dropping destination.
@@ -294,18 +317,12 @@ enum TensorBlockKind {
// Tensor block that was materialized directly into the final output memory
// buffer. For example if the left side of an assignment is a Tensor, we can
- // directly materialize the block in the destination memory. The block
- // expression is still a valid Tensor expression, and can be used to build
- // lazy expressions.
+ // directly materialize the block in the destination memory.
+ //
+ // If strides in the output buffer do not match tensor block strides, the
+ // Tensor expression will be invalid, and should not be used by
+ // TensorBlockAssign or for constructing another block expression.
kMaterializedInOutput
-
- // TODO(ezhulenev): If we know that we are evaluating a block, for the root of
- // the expression tree, it might be beneficial to do an assignment to the
- // output memory buffer, even if it will be impossible to construct a valid
- // block expression after that (e.g. output memory buffer has strides not
- // compatible with TensorMap). This might be a performance optimization for
- // uniformly shaped blocks, because for blocks skewed towards inner dimension
- // `kMaterializedInOutput` should always work.
};
#if !EIGEN_HAS_CXX11
} // namespace TensorBlockKind
@@ -346,6 +363,11 @@ struct XprScalar<void> {
// Tensor), or a memory buffer allocated with scratch allocator, and in this
// case the scratch allocator will deallocate it at the end of block based
// expression execution.
+//
+// If the block was evaluated directly into the output buffer, and strides in
+// the output buffer do not match block strides, the TensorMap expression will
+// be invalid, and should never be used in block assignment or any other tensor
+// expression.
template <typename Scalar, int NumDims, int Layout,
typename IndexType = Eigen::Index>
@@ -358,11 +380,12 @@ class TensorMaterializedBlock {
typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
- const Dimensions& dimensions)
+ const Dimensions& dimensions, bool valid_expr = true)
: m_kind(kind),
m_data(data),
m_dimensions(dimensions),
- m_expr(m_data, m_dimensions) {
+ m_expr(m_data, m_dimensions),
+ m_valid_expr(valid_expr) {
eigen_assert(m_kind == internal::TensorBlockKind::kView ||
m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
m_kind == internal::TensorBlockKind::kMaterializedInOutput);
@@ -372,7 +395,10 @@ class TensorMaterializedBlock {
// NOTE(ezhulenev): Returning XprType by value like in other block types
// causes asan failures. The theory is that XprType::Nested doesn't work
// properly for TensorMap.
- const XprType& expr() const { return m_expr; }
+ const XprType& expr() const {
+ eigen_assert(m_valid_expr);
+ return m_expr;
+ }
const Scalar* data() const { return m_data; }
void cleanup() {}
@@ -427,6 +453,7 @@ class TensorMaterializedBlock {
bool materialized_in_output;
if (block_buffer != NULL) {
+ desc.DropDestinationBuffer();
materialized_in_output = true;
} else {
@@ -461,6 +488,7 @@ class TensorMaterializedBlock {
const Scalar* m_data;
Dimensions m_dimensions;
XprType m_expr;
+ bool m_valid_expr;
};
// -------------------------------------------------------------------------- //
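
The DestinationBuffer refactoring above keeps dimensions and strides scaled by sizeof(Scalar) and divides them back on access, and the contiguity check now ignores dimensions of size 1, whose strides are irrelevant. A small standalone sketch of that bookkeeping, with hypothetical names rather than Eigen's classes:

#include <array>
#include <cassert>
#include <cstddef>

// Illustrative only: dimensions and strides are stored in byte units
// (multiplied by sizeof(Scalar) at construction) and divided back when
// queried for a concrete scalar type.
template <size_t NumDims>
struct ByteSizedBuffer {
  std::array<size_t, NumDims> dims_in_bytes;
  std::array<size_t, NumDims> strides_in_bytes;

  template <typename Scalar>
  std::array<size_t, NumDims> dimensions() const {
    std::array<size_t, NumDims> out{};
    for (size_t i = 0; i < NumDims; ++i) {
      assert(dims_in_bytes[i] % sizeof(Scalar) == 0);
      out[i] = dims_in_bytes[i] / sizeof(Scalar);
    }
    return out;
  }

  template <typename Scalar>
  std::array<size_t, NumDims> strides() const {
    std::array<size_t, NumDims> out{};
    for (size_t i = 0; i < NumDims; ++i) {
      assert(strides_in_bytes[i] % sizeof(Scalar) == 0);
      out[i] = strides_in_bytes[i] / sizeof(Scalar);
    }
    return out;
  }
};

// Stride comparison that skips dimensions of size 1, mirroring the updated
// "fits contiguously" check in the patch.
template <size_t NumDims>
bool strides_compatible(const std::array<size_t, NumDims>& dims,
                        const std::array<size_t, NumDims>& desc_strides,
                        const std::array<size_t, NumDims>& dst_strides) {
  for (size_t i = 0; i < NumDims; ++i) {
    if (dims[i] == 1) continue;  // stride of a unit dimension does not matter
    if (desc_strides[i] != dst_strides[i]) return false;
  }
  return true;
}

int main() {
  ByteSizedBuffer<2> buf{{4 * sizeof(float), 3 * sizeof(float)},
                         {1 * sizeof(float), 4 * sizeof(float)}};
  auto dims = buf.dimensions<float>();  // {4, 3}
  auto strides = buf.strides<float>();  // {1, 4}
  assert(strides_compatible<2>(dims, {1, 4}, strides));
  return 0;
}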
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index dc9551d32..cc0a00e8d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -882,7 +882,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
static const bool
is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 20591da33..7eaf1f09e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -368,7 +368,8 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
const Index chip_dim = m_dim.actualDim();
DSizes<Index, NumInputDims> input_block_dims;
@@ -390,6 +391,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
}
ArgTensorBlock arg_block = m_impl.blockV2(arg_desc, scratch);
+ if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
if (arg_block.data() != NULL) {
// Forward argument block buffer if possible.
@@ -405,6 +407,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
bool materialized_in_output;
if (output_buffer != NULL) {
+ desc.DropDestinationBuffer();
materialized_in_output = true;
} else {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index cc3e67677..2a6d67ad5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -404,7 +404,8 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
return TensorBlockV2(m_impl.blockV2(desc, scratch),
TensorConversionOpBlockFactory());
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index d7bebd30b..132458a20 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -481,7 +481,7 @@ struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
template <typename Dims1, typename Dims2>
-EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) {
return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
}
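
dimensions_match() now takes its arguments by value (and is marked EIGEN_ALWAYS_INLINE), e.g. so that it can be called with temporaries such as the strides computed on the fly in TensorShuffling::blockV2 further down. A tiny hypothetical sketch of the constraint behind this change:

#include <array>

// Non-const lvalue-reference parameters cannot bind to temporaries, which is
// why the by-reference signature was replaced with a by-value one.
template <typename Dims1, typename Dims2>
bool match_by_ref(Dims1& a, Dims2& b) { return a.size() == b.size(); }

template <typename Dims1, typename Dims2>
bool match_by_value(Dims1 a, Dims2 b) { return a.size() == b.size(); }

std::array<int, 2> make_strides() { return {1, 4}; }

int main() {
  std::array<int, 2> lhs = {1, 4};
  // match_by_ref(lhs, make_strides());  // would not compile: an rvalue
                                         // cannot bind to a non-const reference
  return match_by_value(lhs, make_strides()) ? 0 : 1;
}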
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index b77d8fe84..4c2767d44 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -166,7 +166,8 @@ struct TensorEvaluator
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
assert(m_data != NULL);
return TensorBlockV2::materialize(m_data, m_dims, desc, scratch);
}
@@ -353,7 +354,8 @@ struct TensorEvaluator<const Derived, Device>
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
assert(m_data != NULL);
return TensorBlockV2::materialize(m_data, m_dims, desc, scratch);
}
@@ -571,7 +573,8 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
return TensorBlockV2(m_argImpl.blockV2(desc, scratch), m_functor);
}
@@ -729,7 +732,8 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
desc.DropDestinationBuffer();
return TensorBlockV2(m_leftImpl.blockV2(desc, scratch),
m_rightImpl.blockV2(desc, scratch), m_functor);
@@ -993,7 +997,8 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
// It's unsafe to pass destination buffer to underlying expressions, because
// output might be aliased with one of the inputs.
desc.DropDestinationBuffer();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 6ad6327a6..97ac96db1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -521,19 +521,6 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const ThreadPoolDevice& device) {
Evaluator evaluator(expr, device);
- Index total_size = array_prod(evaluator.dimensions());
- Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-
- // TODO(ezuhulenev): For small expressions cost of block mapping and
- // resource requirements gathering dominates the cost of expression
- // evaluatiuon.
- if (total_size < cache_size &&
- !ExpressionHasTensorBroadcastingOp<Expression>::value) {
- internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
- /*Tiling=*/TiledEvaluation::Off>::run(expr, device);
- evaluator.cleanup();
- return;
- }
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index d98af1355..7d12e781e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -176,7 +176,8 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
assert(m_buffer != NULL);
return TensorBlockV2::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
}
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index 38d0bf7d3..c69e2df92 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -238,7 +238,8 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
static const bool is_col_major =
static_cast<int>(Layout) == static_cast<int>(ColMajor);
@@ -253,6 +254,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
bool materialized_in_output;
if (block_buffer != NULL) {
+ desc.DropDestinationBuffer();
materialized_in_output = true;
} else {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index c9d78ba9b..ab3a979a8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -365,7 +365,8 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
eigen_assert(m_impl.data() != NULL);
eigen_assert((kind == Runtime) ||
(kind == OneByN && desc.dimensions()[0] == 1) ||
@@ -611,7 +612,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
- BlockAccessV2 = false,
+ BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
@@ -624,7 +625,12 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ // Tensor slicing does not change the block type.
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+ TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -804,6 +810,15 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
m_impl.block(&input_block);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset()));
+ TensorBlockV2 block = m_impl.blockV2(arg_desc, scratch);
+ if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
+ return block;
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
typename Storage::Type result = constCast(m_impl.data());
if (result) {
@@ -900,7 +915,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
- BlockAccessV2 = false,
+ BlockAccessV2 = TensorEvaluator<ArgType, Device>::BlockAccessV2,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
@@ -913,7 +928,8 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
typedef typename TensorBlock::Dimensions TensorBlockDimensions;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -987,6 +1003,13 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
block.block_strides(), TensorBlockDimensions(this->m_inputStrides),
const_cast<ScalarNoConst*>(block.data())));
}
+
+ template<typename TensorBlockV2>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
+ const TensorBlockDesc& desc, const TensorBlockV2& block) {
+ TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset()));
+ this->m_impl.writeBlockV2(arg_desc, block);
+ }
};
namespace internal {
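
The slicing evaluators above gain blockV2()/writeBlockV2() by shifting the descriptor offset into the argument's coordinate space with srcCoeff() and forwarding to the underlying evaluator. A toy one-dimensional sketch of that forwarding pattern (placeholder types, not Eigen's API):

#include <cassert>
#include <cstddef>
#include <vector>

struct Desc {
  std::ptrdiff_t offset;
  std::ptrdiff_t size;
  Desc WithOffset(std::ptrdiff_t new_offset) const { return {new_offset, size}; }
};

// Stand-in for the underlying evaluator that owns the full buffer.
struct BufferEvaluator {
  std::vector<float>& data;
  void writeBlock(const Desc& desc, const std::vector<float>& block) {
    for (std::ptrdiff_t i = 0; i < desc.size; ++i)
      data[desc.offset + i] = block[i];
  }
};

struct SliceEvaluator {
  BufferEvaluator arg;
  std::ptrdiff_t slice_start;  // linearized start of the slice in the argument

  std::ptrdiff_t srcCoeff(std::ptrdiff_t index) const {
    return slice_start + index;  // simplified stand-in for srcCoeff()
  }

  void writeBlock(const Desc& desc, const std::vector<float>& block) {
    // Shift the descriptor into the argument's coordinates and forward,
    // as the patch does with desc.WithOffset(srcCoeff(desc.offset())).
    arg.writeBlock(desc.WithOffset(srcCoeff(desc.offset)), block);
  }
};

int main() {
  std::vector<float> storage(10, 0.f);
  SliceEvaluator slice{BufferEvaluator{storage}, /*slice_start=*/4};
  slice.writeBlock(Desc{/*offset=*/0, /*size=*/3}, {1.f, 2.f, 3.f});
  assert(storage[4] == 1.f && storage[6] == 3.f);
  return 0;
}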
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index a0b4e04b1..99c74fc67 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -230,7 +230,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
// If one of the dimensions is zero, return empty block view.
if (desc.size() == 0) {
return TensorBlockV2(internal::TensorBlockKind::kView, NULL,
@@ -240,8 +241,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
// Check if we can reuse `desc` destination, or allocate new scratch buffer.
ScalarNoConst* materialized_output =
desc.template destination<ScalarNoConst, Layout>();
-
bool materialized_in_output;
+
if (materialized_output != NULL) {
desc.DropDestinationBuffer();
materialized_in_output = true;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 6e7abeb09..a51c88540 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -355,7 +355,8 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
- blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
// TODO(ezhulenev): If underlying tensor expression supports and prefers
// block evaluation we must use it. Currently we use coeff and packet
// access into the underlying tensor expression.
@@ -370,10 +371,12 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
const bool inner_dim_reversed = m_reverse[inner_dim_idx];
// Try to reuse destination as an output block buffer.
- CoeffReturnType* block_buffer = desc.template destination<CoeffReturnType, Layout>();
+ CoeffReturnType* block_buffer =
+ desc.template destination<CoeffReturnType, Layout>();
bool materialized_in_output;
if (block_buffer != NULL) {
+ desc.DropDestinationBuffer();
materialized_in_output = true;
} else {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 5e8abad75..bb9908b62 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -116,7 +116,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
- BlockAccessV2 = false,
+ BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -131,7 +131,12 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
TensorBlockReader;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+ Layout, Index>
+ TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
@@ -143,6 +148,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
const Shuffle& shuffle = op.shufflePermutation();
m_is_identity = true;
for (int i = 0; i < NumDims; ++i) {
+ m_shuffle[i] = static_cast<int>(shuffle[i]);
m_dimensions[i] = input_dims[shuffle[i]];
m_inverseShuffle[shuffle[i]] = i;
if (m_is_identity && shuffle[i] != i) {
@@ -241,7 +247,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
1, m_device.firstLevelCacheSize() / sizeof(Scalar));
resources->push_back(internal::TensorOpResourceRequirements(
internal::kUniformAllDims, block_total_size_max));
- m_impl.getResourceRequirements(resources);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
@@ -336,6 +341,78 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool root_of_expr_ast = false) const {
+ assert(m_impl.data() != NULL);
+
+ typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumDims, Layout>
+ TensorBlockIO;
+ typedef typename TensorBlockIO::Dst TensorBlockIODst;
+ typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+ ScalarNoConst* block_buffer = NULL;
+ typename TensorBlockIO::Dimensions block_strides;
+
+ bool materialized_in_output = false;
+ bool has_valid_materialized_expr = true;
+
+ if (desc.HasDestinationBuffer()) {
+ // Check if we can reuse destination buffer for block materialization.
+ const typename TensorBlockDesc::DestinationBuffer& destination_buffer =
+ desc.GetDestinationBuffer();
+
+ const bool dims_match = dimensions_match(
+ desc.dimensions(), destination_buffer.template dimensions<Scalar>());
+
+ const bool strides_match =
+ dimensions_match(internal::strides<Layout>(desc.dimensions()),
+ destination_buffer.template strides<Scalar>());
+
+ if (dims_match && strides_match) {
+ // Destination buffer fits the block contiguously.
+ materialized_in_output = true;
+ has_valid_materialized_expr = true;
+ block_buffer = destination_buffer.template data<ScalarNoConst>();
+ block_strides = internal::strides<Layout>(desc.dimensions());
+ eigen_assert(block_buffer != NULL);
+
+ } else if (dims_match && root_of_expr_ast) {
+ // Destination buffer has strides not matching the block strides, but
+ // for the root of the expression tree it's safe to materialize anyway.
+ materialized_in_output = true;
+ has_valid_materialized_expr = false;
+ block_buffer = destination_buffer.template data<ScalarNoConst>();
+ block_strides = destination_buffer.template strides<ScalarNoConst>();
+ eigen_assert(block_buffer != NULL);
+ }
+
+ if (materialized_in_output) desc.DropDestinationBuffer();
+ }
+
+ // If we were not able to reuse destination buffer, allocate temporary
+ // buffer for block evaluation using scratch allocator.
+ if (!materialized_in_output) {
+ void* mem = scratch.allocate(desc.size() * sizeof(ScalarNoConst));
+ block_buffer = static_cast<ScalarNoConst*>(mem);
+ block_strides = internal::strides<Layout>(desc.dimensions());
+ }
+
+ typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
+ TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));
+
+ TensorBlockIODst dst(desc.dimensions(), block_strides, block_buffer);
+
+ typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
+ TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+ return TensorBlockV2(
+ materialized_in_output
+ ? internal::TensorBlockKind::kMaterializedInOutput
+ : internal::TensorBlockKind::kMaterializedInScratch,
+ block_buffer, desc.dimensions(), has_valid_materialized_expr);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
NumDims * (2 * TensorOpCost::AddCost<Index>() +
@@ -400,7 +477,8 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
Dimensions m_dimensions;
bool m_is_identity;
- array<Index, NumDims> m_inverseShuffle;
+ array<int, NumDims> m_shuffle;
+ array<Index, NumDims> m_inverseShuffle; // TODO(ezhulenev): Make it int type.
array<Index, NumDims> m_outputStrides;
array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
array<Index, NumDims> m_inputStrides;
@@ -431,7 +509,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
- BlockAccessV2 = false,
+ BlockAccessV2 = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
@@ -445,7 +523,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
TensorBlockWriter;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -477,6 +555,63 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
this->m_inverseShuffle,
this->m_unshuffledInputStrides, this->m_impl.data());
}
+
+ template <typename TensorBlockV2>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlockV2(
+ const TensorBlockDesc& desc, const TensorBlockV2& block) {
+ eigen_assert(this->m_impl.data() != NULL);
+
+ typedef internal::TensorBlockIOV2<ScalarNoConst, Index, NumDims, Layout>
+ TensorBlockIO;
+ typedef typename TensorBlockIO::Dst TensorBlockIODst;
+ typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+ const Scalar* block_buffer = block.data();
+
+ // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
+ // expression with coefficient and packet access as `src`.
+ void* mem = NULL;
+ if (block_buffer == NULL) {
+ mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
+ ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
+
+ typedef internal::TensorBlockAssignment<
+ ScalarNoConst, NumDims, typename TensorBlockV2::XprType, Index>
+ TensorBlockAssignment;
+
+ TensorBlockAssignment::Run(
+ TensorBlockAssignment::target(
+ desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
+ buf),
+ block.expr());
+
+ block_buffer = buf;
+ }
+
+ // Read from block.
+ TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
+ block_buffer);
+
+ // Write to the output buffer.
+ typename TensorBlockIO::Dimensions output_strides(
+ this->m_unshuffledInputStrides);
+ typename TensorBlockIO::Dimensions output_dimensions;
+ for (int i = 0; i < NumDims; ++i) {
+ output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
+ }
+ TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
+ this->srcCoeff(desc.offset()));
+
+ // Reorder dimensions according to the shuffle.
+ typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
+ for (int i = 0; i < NumDims; ++i) {
+ dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
+ }
+ TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+ // Deallocate temporary buffer used for the block materialization.
+ if (mem != NULL) this->m_device.deallocate(mem);
+ }
};
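
The shuffling writeBlockV2() above maps block dimensions back onto the unshuffled input using the shuffle permutation and its inverse. A minimal sketch of that index bookkeeping, independent of Eigen:

#include <array>
#include <cassert>

int main() {
  constexpr int kDims = 3;
  const std::array<int, kDims> shuffle = {2, 0, 1};  // output dim -> input dim

  // Build the inverse permutation, as the evaluator does in m_inverseShuffle.
  std::array<int, kDims> inverse{};
  for (int i = 0; i < kDims; ++i) inverse[shuffle[i]] = i;

  // Shuffled (output) block dimensions for an input of shape {4, 5, 6}.
  const std::array<int, kDims> input_dims = {4, 5, 6};
  std::array<int, kDims> output_dims{};
  for (int i = 0; i < kDims; ++i) output_dims[i] = input_dims[shuffle[i]];

  // Writing back: recover the input dimensions from the block dimensions,
  // mirroring `output_dimensions[this->m_shuffle[i]] = desc.dimension(i)`.
  std::array<int, kDims> recovered{};
  for (int i = 0; i < kDims; ++i) recovered[shuffle[i]] = output_dims[i];

  assert(recovered == input_dims);
  assert(inverse[shuffle[0]] == 0);  // the inverse really inverts the shuffle
  return 0;
}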