path: root/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
author    Eugene Zhulenev <ezhulenev@google.com>  2019-10-10 10:56:58 -0700
committer Eugene Zhulenev <ezhulenev@google.com>  2019-10-10 10:56:58 -0700
commit    a411e9f344a354673b72a490433cf3cc2148ddf1 (patch)
tree      65d0e152a0cc6649ecb8b67c0579386475dbaf53 /unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
parent    b03eb63d7cb869cc4486ac393fad75fbcc36027f (diff)
Block evaluation for TensorGenerator + TensorReverse + fixed bug in tensor reverse op
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 196
1 file changed, 169 insertions(+), 27 deletions(-)
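
For context (not part of the patch), a minimal usage sketch of the kind of expression this evaluator handles: reversing selected dimensions of a tensor with the unsupported Tensor module. With this change the reversed expression can be evaluated block-by-block (blockV2) rather than one coefficient at a time; the dimension sizes and reversed-dimension choices below are arbitrary illustration values.

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Rank-3 tensor with arbitrary sizes.
  Eigen::Tensor<float, 3> input(2, 3, 4);
  input.setRandom();

  // Reverse the first and last dimensions.
  Eigen::array<bool, 3> reverse_dims{{true, false, true}};

  // Assigning the reversed expression drives the TensorReverse
  // evaluator patched below.
  Eigen::Tensor<float, 3> output = input.reverse(reverse_dims);

  // Element-wise: output(i, j, k) == input(1 - i, j, 3 - k).
  return 0;
}
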
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 855d04eb7..6e7abeb09 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -116,7 +116,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
- BlockAccessV2 = false,
+ BlockAccessV2 = NumDims > 0,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
@@ -130,7 +130,15 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
OutputTensorBlock;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlockV2;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlockV2
+ ArgTensorBlock;
+
+ typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+ Layout, Index>
+ TensorBlockV2;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
@@ -240,17 +248,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
internal::kSkewedInnerDims, block_total_size_max));
}
- struct BlockIteratorState {
- Index block_size;
- Index block_stride;
- Index block_span;
- Index input_size;
- Index input_stride;
- Index input_span;
- Index count;
- bool reverse;
- };
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
OutputTensorBlock* output_block) const {
if (NumDims <= 0) return;
@@ -278,15 +275,16 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
array<BlockIteratorState, NumDims> it;
for (Index i = 0; i < NumDims; ++i) {
const Index dim = isColMajor ? i : NumDims - 1 - i;
- it[i].block_size = output_block->block_sizes()[dim];
- it[i].block_stride = output_block->block_strides()[dim];
- it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
- it[i].input_size = m_dimensions[dim];
- it[i].input_stride = m_strides[dim];
- it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+ it[i].size = output_block->block_sizes()[dim];
it[i].count = 0;
it[i].reverse = m_reverse[dim];
+ it[i].block_stride = output_block->block_strides()[dim];
+ it[i].block_span = it[i].block_stride * (it[i].size - 1);
+
+ it[i].input_stride = m_strides[dim];
+ it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
if (it[i].reverse) {
it[i].input_stride = -1 * it[i].input_stride;
it[i].input_span = -1 * it[i].input_span;
@@ -298,17 +296,16 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
int effective_inner_dim = 0;
for (int i = 1; i < NumDims; ++i) {
if (it[i].reverse != it[effective_inner_dim].reverse) break;
- if (it[i].block_stride != it[effective_inner_dim].input_size) break;
+ if (it[i].block_stride != it[effective_inner_dim].size) break;
if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
- it[i].block_size = it[effective_inner_dim].block_size * it[i].block_size;
- it[i].input_size = it[effective_inner_dim].input_size * it[i].input_size;
+ it[i].size = it[effective_inner_dim].size * it[i].size;
it[i].block_stride = 1;
it[i].input_stride = (inner_dim_reversed ? -1 : 1);
- it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
- it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+ it[i].block_span = it[i].block_stride * (it[i].size - 1);
+ it[i].input_span = it[i].input_stride * (it[i].size - 1);
effective_inner_dim = i;
}
@@ -317,9 +314,9 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
eigen_assert(it[effective_inner_dim].input_stride ==
(inner_dim_reversed ? -1 : 1));
- const Index inner_dim_size = it[effective_inner_dim].block_size;
+ const Index inner_dim_size = it[effective_inner_dim].size;
- while (it[NumDims - 1].count < it[NumDims - 1].block_size) {
+ while (it[NumDims - 1].count < it[NumDims - 1].size) {
// Copy inner-most dimension data from reversed location in input.
Index dst = block_offset;
Index src = input_offset;
@@ -345,7 +342,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
// Update offset.
for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
- if (++it[i].count < it[i].block_size) {
+ if (++it[i].count < it[i].size) {
block_offset += it[i].block_stride;
input_offset += it[i].input_stride;
break;
@@ -357,6 +354,131 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
}
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlockV2
+ blockV2(TensorBlockDesc& desc, TensorBlockScratch& scratch) const {
+ // TODO(ezhulenev): If underlying tensor expression supports and prefers
+ // block evaluation we must use it. Currently we use coeff and packet
+ // access into the underlying tensor expression.
+ // static const bool useBlockAccessForArgType =
+ // TensorEvaluator<ArgType, Device>::BlockAccess &&
+ // TensorEvaluator<ArgType, Device>::PreferBlockAccess;
+
+ static const bool isColMajor =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+ static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
+ const bool inner_dim_reversed = m_reverse[inner_dim_idx];
+
+ // Try to reuse destination as an output block buffer.
+ CoeffReturnType* block_buffer = desc.template destination<CoeffReturnType, Layout>();
+ bool materialized_in_output;
+
+ if (block_buffer != NULL) {
+ materialized_in_output = true;
+
+ } else {
+ materialized_in_output = false;
+ void* mem = scratch.allocate(desc.size() * sizeof(CoeffReturnType));
+ block_buffer = static_cast<CoeffReturnType*>(mem);
+ }
+
+ // Offset in the output block.
+ Index block_offset = 0;
+
+ // Offset in the input Tensor.
+ Index input_offset = reverseIndex(desc.offset());
+
+ // Initialize output block iterator state. Dimensions in this array are
+ // always in inner_most -> outer_most order (col-major layout).
+ array<BlockIteratorState, NumDims> it;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = isColMajor ? i : NumDims - 1 - i;
+ it[i].size = desc.dimension(dim);
+ it[i].count = 0;
+ it[i].reverse = m_reverse[dim];
+
+ it[i].block_stride =
+ i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride);
+ it[i].block_span = it[i].block_stride * (it[i].size - 1);
+
+ it[i].input_stride = m_strides[dim];
+ it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+ if (it[i].reverse) {
+ it[i].input_stride = -1 * it[i].input_stride;
+ it[i].input_span = -1 * it[i].input_span;
+ }
+ }
+
+ // If multiple inner dimensions have the same reverse flag, check if we can
+ // merge them into a single virtual inner dimension.
+ int effective_inner_dim = 0;
+ for (int i = 1; i < NumDims; ++i) {
+ if (it[i].reverse != it[effective_inner_dim].reverse) break;
+ if (it[i].block_stride != it[effective_inner_dim].size) break;
+ if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
+
+ it[i].size = it[effective_inner_dim].size * it[i].size;
+
+ it[i].block_stride = 1;
+ it[i].input_stride = (inner_dim_reversed ? -1 : 1);
+
+ it[i].block_span = it[i].block_stride * (it[i].size - 1);
+ it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+ effective_inner_dim = i;
+ }
+
+ eigen_assert(it[effective_inner_dim].block_stride == 1);
+ eigen_assert(it[effective_inner_dim].input_stride ==
+ (inner_dim_reversed ? -1 : 1));
+
+ const Index inner_dim_size = it[effective_inner_dim].size;
+
+ while (it[NumDims - 1].count < it[NumDims - 1].size) {
+ // Copy inner-most dimension data from reversed location in input.
+ Index dst = block_offset;
+ Index src = input_offset;
+
+ // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
+ // worse results in benchmarks than a simple coefficient loop.
+ if (inner_dim_reversed) {
+ for (Index i = 0; i < inner_dim_size; ++i) {
+ block_buffer[dst] = m_impl.coeff(src);
+ ++dst;
+ --src;
+ }
+ } else {
+ for (Index i = 0; i < inner_dim_size; ++i) {
+ block_buffer[dst] = m_impl.coeff(src);
+ ++dst;
+ ++src;
+ }
+ }
+
+ // For the 1d tensor we need to generate only one inner-most dimension.
+ if ((NumDims - effective_inner_dim) == 1) break;
+
+ // Update offset.
+ for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
+ if (++it[i].count < it[i].size) {
+ block_offset += it[i].block_stride;
+ input_offset += it[i].input_stride;
+ break;
+ }
+ if (i != NumDims - 1) it[i].count = 0;
+ block_offset -= it[i].block_span;
+ input_offset -= it[i].input_span;
+ }
+ }
+
+ return TensorBlockV2(
+ materialized_in_output
+ ? internal::TensorBlockKind::kMaterializedInOutput
+ : internal::TensorBlockKind::kMaterializedInScratch,
+ block_buffer, desc.dimensions());
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
2 * TensorOpCost::MulCost<Index>() +
@@ -386,6 +508,26 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
TensorEvaluator<ArgType, Device> m_impl;
ReverseDimensions m_reverse;
const Device EIGEN_DEVICE_REF m_device;
+
+ private:
+ struct BlockIteratorState {
+ BlockIteratorState()
+ : size(0),
+ count(0),
+ reverse(false),
+ block_stride(0),
+ block_span(0),
+ input_stride(0),
+ input_span(0) {}
+
+ Index size;
+ Index count;
+ bool reverse;
+ Index block_stride;
+ Index block_span;
+ Index input_stride;
+ Index input_span;
+ };
};
// Eval as lvalue
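
Beyond the diff itself, a standalone sketch (using plain std:: types rather than the Eigen internals above; reversed_offset is a hypothetical helper, not part of the patch) of the index mapping the new blockV2 path depends on: decompose a linear offset into per-dimension indices using column-major strides, flip each reversed dimension to size - 1 - index, and recompose the result. The evaluator's reverseIndex() plays this role for the block's starting offset.

#include <array>
#include <cstddef>

template <int NumDims>
std::ptrdiff_t reversed_offset(std::ptrdiff_t offset,
                               const std::array<std::ptrdiff_t, NumDims>& dims,
                               const std::array<bool, NumDims>& reverse) {
  // Column-major strides: stride[0] = 1, stride[i] = stride[i-1] * dims[i-1].
  std::array<std::ptrdiff_t, NumDims> strides;
  strides[0] = 1;
  for (int i = 1; i < NumDims; ++i) strides[i] = strides[i - 1] * dims[i - 1];

  std::ptrdiff_t result = 0;
  for (int i = NumDims - 1; i >= 0; --i) {
    // Decompose the linear offset into the index along dimension i.
    std::ptrdiff_t idx = offset / strides[i];
    offset -= idx * strides[i];
    // Flip the index for reversed dimensions, then recompose.
    if (reverse[i]) idx = dims[i] - 1 - idx;
    result += idx * strides[i];
  }
  return result;
}

// Example: for dims {2, 3, 4} with dimensions 0 and 2 reversed, offset 0
// (indices 0,0,0) maps to indices (1,0,3), i.e. linear offset 19:
//   reversed_offset<3>(0, {{2, 3, 4}}, {{true, false, true}}) == 19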