author    Eugene Zhulenev <ezhulenev@google.com>  2019-06-28 11:13:44 -0700
committer Eugene Zhulenev <ezhulenev@google.com>  2019-06-28 11:13:44 -0700
commit    878845cb25c1ba9e56883fd0654eafb55a22fc34 (patch)
tree      848fdcee1dc377feee2ef45495b3ad21839d0244 /unsupported
parent    16a56b2dddbfaf2d4b81d62be5e3139f12783ac8 (diff)
Add block access to TensorReverseOp and make sure that TensorForcedEval uses block access when preferred
Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h     |  38
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h   |   2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h |   8
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h    | 158
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h  |  22
-rw-r--r--  unsupported/test/cxx11_tensor_executor.cpp            |  43
6 files changed, 245 insertions(+), 26 deletions(-)
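
Not part of the patch: a minimal user-level sketch of the kind of expression this commit targets. Dimensions, thread count, and variable names are illustrative; the point is that forcing evaluation of a reverse expression now goes through the tiled (block-based) executor, because TensorEvalToOp reports BlockAccess = true and TensorForcedEvalOp picks the tiled executor when the argument prefers block access.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Thread-pool device; 4 threads is an arbitrary choice for illustration.
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, 4);

  Eigen::Tensor<float, 3> src(64, 32, 16);
  src.setRandom();

  // Reverse the first and last dimensions.
  Eigen::array<bool, 3> to_reverse{{true, false, true}};

  // Forcing evaluation of the reversed expression: with this change the
  // intermediate buffer is filled block by block whenever the argument
  // prefers block access (TensorReverseOp now sets PreferBlockAccess).
  Eigen::Tensor<float, 3> dst(64, 32, 16);
  dst.device(device) = src.reverse(to_reverse).eval();
  return 0;
}
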
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 554ee5f59..910472ad8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -77,6 +77,8 @@ class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>,
typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
+ static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
: m_xpr(expr), m_buffer(buffer) {}
@@ -105,15 +107,22 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
- IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = true,
PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = true
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = true
};
+ typedef typename internal::TensorBlock<
+ CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+ TensorBlock;
+ typedef typename internal::TensorBlockReader<
+ CoeffReturnType, Index, internal::traits<ArgType>::NumDimensions, Layout>
+ TensorBlockReader;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_device(device),
m_buffer(op.buffer()), m_op(op), m_expression(op.expression())
@@ -143,6 +152,18 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ m_impl.getResourceRequirements(resources);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlock* block) {
+ TensorBlock eval_to_block(block->first_coeff_index(), block->block_sizes(),
+ block->tensor_strides(), block->tensor_strides(),
+ m_buffer + block->first_coeff_index());
+ m_impl.block(&eval_to_block);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
@@ -158,6 +179,11 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(TensorBlock* block) const {
+ assert(m_buffer != NULL);
+ TensorBlockReader::Run(block, m_buffer);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
// We assume that evalPacket or evalScalar is called to perform the
// assignment and account for the cost of the write here.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 7b5842571..647c98d4e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -346,7 +346,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
// expressions.
const int thread_idx = device.currentThreadId();
eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
- Scalar* thread_buf = reinterpret_cast<Scalar*>(
+ ScalarNoConst* thread_buf = reinterpret_cast<ScalarNoConst*>(
static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
auto block = block_mapper.GetBlockForIndex(i, thread_buf);
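
An aside on the one-line TensorExecutor.h change above: for const expressions the evaluator's scalar type can itself be const-qualified, so a per-thread scratch buffer that blocks are written into must use the non-const alias. A standalone sketch of the type stripping, using std::remove_const rather than Eigen's internal helper:

#include <type_traits>

// For a `const` tensor expression the scalar type may be `const float`;
// a writable per-thread block buffer needs the const removed.
using Scalar = const float;
using ScalarNoConst = std::remove_const<Scalar>::type;
static_assert(std::is_same<ScalarNoConst, float>::value,
              "block scratch buffers must use the non-const scalar type");
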
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 74b905329..186457a31 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -126,8 +126,14 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
}
typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
EvalTo evalToTmp(m_buffer, m_op);
+
const bool Vectorize = internal::IsVectorizable<Device, const ArgType>::value;
- internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type, Vectorize>::run(evalToTmp, m_device);
+ const bool Tile = TensorEvaluator<const ArgType, Device>::BlockAccess &&
+ TensorEvaluator<const ArgType, Device>::PreferBlockAccess;
+
+ internal::TensorExecutor<const EvalTo,
+ typename internal::remove_const<Device>::type,
+ Vectorize, Tile>::run(evalToTmp, m_device);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index b7fb969f3..33af7d995 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -111,18 +111,25 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = true,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false,
+
};
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+ typedef internal::TensorBlock<ScalarNoConst, Index, NumDims, Layout>
+ OutputTensorBlock;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
- : m_impl(op.expression(), device), m_reverse(op.reverse())
+ : m_impl(op.expression(), device),
+ m_reverse(op.reverse()),
+ m_device(device)
{
// Reversing a scalar isn't supported yet. It would be a no-op anyway.
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -140,6 +147,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
}
}
+ // Remember the strides for fast division.
+ for (int i = 0; i < NumDims; ++i) {
+ m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]);
+ }
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -159,7 +170,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
- Index idx = index / m_strides[i];
+ Index idx = index / m_fastStrides[i];
index -= idx * m_strides[i];
if (m_reverse[i]) {
idx = m_dimensions[i] - idx - 1;
@@ -173,7 +184,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
}
} else {
for (int i = 0; i < NumDims - 1; ++i) {
- Index idx = index / m_strides[i];
+ Index idx = index / m_fastStrides[i];
index -= idx * m_strides[i];
if (m_reverse[i]) {
idx = m_dimensions[i] - idx - 1;
@@ -212,6 +223,131 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
return rslt;
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements(
+ std::vector<internal::TensorOpResourceRequirements>* resources) const {
+ Eigen::Index block_total_size_max = numext::maxi<Eigen::Index>(
+ 1, m_device.lastLevelCacheSize() / sizeof(Scalar));
+ resources->push_back(internal::TensorOpResourceRequirements(
+ internal::kSkewedInnerDims, block_total_size_max));
+ }
+
+ struct BlockIteratorState {
+ Index block_size;
+ Index block_stride;
+ Index block_span;
+ Index input_size;
+ Index input_stride;
+ Index input_span;
+ Index count;
+ bool reverse;
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void block(
+ OutputTensorBlock* output_block) const {
+ if (NumDims <= 0) return;
+
+ // TODO(ezhulenev): If underlying tensor expression supports and prefers
+ // block evaluation we must use it. Currently we use coeff and packet
+ // access into the underlying tensor expression.
+ // static const bool useBlockAccessForArgType =
+ // TensorEvaluator<ArgType, Device>::BlockAccess &&
+ // TensorEvaluator<ArgType, Device>::PreferBlockAccess;
+
+ static const bool isColMajor =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+ static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
+ const bool inner_dim_reversed = m_reverse[inner_dim_idx];
+
+ CoeffReturnType* data = output_block->data();
+ Index block_offset = 0;
+
+ Index input_offset = reverseIndex(output_block->first_coeff_index());
+
+ // Initialize the output block iterator state. Dimensions in this array
+ // are always in inner-most -> outer-most order (column-major layout).
+ array<BlockIteratorState, NumDims> it;
+ for (Index i = 0; i < NumDims; ++i) {
+ const Index dim = isColMajor ? i : NumDims - 1 - i;
+ it[i].block_size = output_block->block_sizes()[dim];
+ it[i].block_stride = output_block->block_strides()[dim];
+ it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
+ it[i].input_size = m_dimensions[dim];
+ it[i].input_stride = m_strides[dim];
+ it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+ it[i].count = 0;
+ it[i].reverse = m_reverse[dim];
+
+ if (it[i].reverse) {
+ it[i].input_stride = -1 * it[i].input_stride;
+ it[i].input_span = -1 * it[i].input_span;
+ }
+ }
+
+ // If multiple inner dimensions have the same reverse flag, check if we can
+ // merge them into a single virtual inner dimension.
+ int effective_inner_dim = 0;
+ for (int i = 1; i < NumDims; ++i) {
+ if (it[i].reverse != it[effective_inner_dim].reverse) break;
+ if (it[i].block_stride != it[effective_inner_dim].input_size) break;
+ if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
+
+ it[i].block_size = it[effective_inner_dim].block_size * it[i].block_size;
+ it[i].input_size = it[effective_inner_dim].input_size * it[i].input_size;
+
+ it[i].block_stride = 1;
+ it[i].input_stride = (inner_dim_reversed ? -1 : 1);
+
+ it[i].block_span = it[i].block_stride * (it[i].block_size - 1);
+ it[i].input_span = it[i].input_stride * (it[i].input_size - 1);
+
+ effective_inner_dim = i;
+ }
+
+ eigen_assert(it[effective_inner_dim].block_stride == 1);
+ eigen_assert(it[effective_inner_dim].input_stride ==
+ (inner_dim_reversed ? -1 : 1));
+
+ const Index inner_dim_size = it[effective_inner_dim].block_size;
+
+ while (it[NumDims - 1].count < it[NumDims - 1].block_size) {
+ // Copy inner-most dimension data from reversed location in input.
+ Index dst = block_offset;
+ Index src = input_offset;
+
+ // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
+ // worse results in benchmarks than a simple coefficient loop.
+ if (inner_dim_reversed) {
+ for (Index i = 0; i < inner_dim_size; ++i) {
+ data[dst] = m_impl.coeff(src);
+ ++dst;
+ --src;
+ }
+ } else {
+ for (Index i = 0; i < inner_dim_size; ++i) {
+ data[dst] = m_impl.coeff(src);
+ ++dst;
+ ++src;
+ }
+ }
+
+ // For a 1d tensor (only the inner-most dimension left) a single pass is enough.
+ if ((NumDims - effective_inner_dim) == 1) break;
+
+ // Update offset.
+ for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
+ if (++it[i].count < it[i].block_size) {
+ block_offset += it[i].block_stride;
+ input_offset += it[i].input_stride;
+ break;
+ }
+ if (i != NumDims - 1) it[i].count = 0;
+ block_offset -= it[i].block_span;
+ input_offset -= it[i].input_span;
+ }
+ }
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
2 * TensorOpCost::MulCost<Index>() +
@@ -235,8 +371,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
+ array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides;
TensorEvaluator<ArgType, Device> m_impl;
ReverseDimensions m_reverse;
+ const Device& m_device;
};
// Eval as lvalue
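
The core of the patch is the block() method added above: it maps the block's first coefficient to its (possibly reversed) input index, negates the input stride and span for reversed dimensions, merges compatible inner dimensions into one virtual dimension, and then copies the inner-most dimension one coefficient at a time. A standalone sketch of that inner copy (not Eigen code; plain pointers stand in for evaluator coefficients):

#include <cstddef>
#include <vector>

// Copy one inner-most "row" of a block, walking the input backwards when that
// dimension is reversed -- the same strategy as the coefficient loop in
// TensorReverseOp::block(), which the patch notes outperformed a vectorized
// packet-reverse path in benchmarks.
void copy_inner_dim(const float* input, float* output,
                    std::ptrdiff_t inner_size, bool reversed) {
  std::ptrdiff_t src = reversed ? inner_size - 1 : 0;
  const std::ptrdiff_t step = reversed ? -1 : 1;
  for (std::ptrdiff_t dst = 0; dst < inner_size; ++dst, src += step) {
    output[dst] = input[src];
  }
}

int main() {
  std::vector<float> in = {1, 2, 3, 4};
  std::vector<float> out(in.size());
  copy_inner_dim(in.data(), out.data(), in.size(), /*reversed=*/true);
  // out is now {4, 3, 2, 1}.
  return 0;
}
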
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 416948765..b577d4d36 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -270,6 +270,11 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
input_block_strides[i + 1] * input_block_sizes[i + 1];
}
}
+ DSizes<internal::TensorIntDivisor<Index>, NumDims> fast_input_block_strides;
+ for (int i = 0; i < NumDims; ++i) {
+ fast_input_block_strides[i] =
+ internal::TensorIntDivisor<Index>(input_block_strides[i]);
+ }
// Read input block.
TensorBlock input_block(srcCoeff(output_block->first_coeff_index()),
@@ -293,8 +298,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
continue;
}
- Index output_index = GetBlockOutputIndex(input_index, input_block_strides,
- output_block_strides);
+ Index output_index =
+ GetBlockOutputIndex(input_index, input_block_strides,
+ output_block_strides, fast_input_block_strides);
if (output_index == input_index) {
// Coefficient already in place.
bitmap[output_index] = true;
@@ -312,8 +318,9 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
data[output_index] = shuffled_value;
shuffled_value = evicted_value;
bitmap[output_index] = true;
- output_index = GetBlockOutputIndex(output_index, input_block_strides,
- output_block_strides);
+ output_index =
+ GetBlockOutputIndex(output_index, input_block_strides,
+ output_block_strides, fast_input_block_strides);
} while (output_index != input_index);
data[output_index] = shuffled_value;
@@ -341,11 +348,12 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
Index input_index,
const DSizes<Index, NumDims>& input_block_strides,
- const DSizes<Index, NumDims>& output_block_strides) const {
+ const DSizes<Index, NumDims>& output_block_strides,
+ const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
Index output_index = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = input_index / input_block_strides[i];
+ const Index idx = input_index / fast_input_block_strides[i];
output_index += idx * output_block_strides[m_inverseShuffle[i]];
input_index -= idx * input_block_strides[i];
}
@@ -353,7 +361,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
output_block_strides[m_inverseShuffle[0]];
} else {
for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = input_index / input_block_strides[i];
+ const Index idx = input_index / fast_input_block_strides[i];
output_index += idx * output_block_strides[m_inverseShuffle[i]];
input_index -= idx * input_block_strides[i];
}
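
Both the TensorReverse.h and TensorShuffling.h hunks above replace per-coefficient divisions by strides with divisions by precomputed internal::TensorIntDivisor values. The operation being accelerated is the usual decomposition of a flat index into per-dimension coordinates; below is a standalone sketch of that loop (column-major, with plain integer division standing in for the fast divisor):

#include <array>
#include <cstdint>

// Split a flat column-major index into per-dimension coordinates by repeated
// division by precomputed strides. This is the hot loop in coeff()/srcCoeff();
// the patch swaps `index / strides[i]` for a division by a precomputed
// TensorIntDivisor, which avoids hardware integer division.
template <int NumDims>
std::array<std::int64_t, NumDims> decompose(
    std::int64_t index, const std::array<std::int64_t, NumDims>& strides) {
  std::array<std::int64_t, NumDims> coords{};
  for (int i = NumDims - 1; i > 0; --i) {
    coords[i] = index / strides[i];
    index -= coords[i] * strides[i];
  }
  coords[0] = index;
  return coords;
}

int main() {
  // Strides for a 4 x 3 x 2 column-major tensor are {1, 4, 12}.
  const std::array<std::int64_t, 3> strides = {1, 4, 12};
  const auto c = decompose<3>(17, strides);  // 17 = 1*1 + 1*4 + 1*12 -> {1, 1, 1}
  return static_cast<int>(c[0] + c[1] + c[2]) == 3 ? 0 : 1;
}
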
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
index 162dab7b8..42f0a1bc3 100644
--- a/unsupported/test/cxx11_tensor_executor.cpp
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -527,6 +527,41 @@ static void test_execute_generator_op(Device d)
}
}
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ bool Tileable, int Layout>
+static void test_execute_reverse_rvalue(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
+ Tensor <T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ // Reverse half of the dimensions.
+ Eigen::array<bool, NumDims> reverse;
+ for (int i = 0; i < NumDims; ++i) reverse[i] = (dims[i] % 2 == 0);
+
+ const auto expr = src.reverse(reverse);
+
+ // We assume that reversing on a default device is tested and correct, so
+ // we can rely on it to verify correctness of tensor executor and tiling.
+ Tensor <T, NumDims, Options, Index> golden;
+ golden = expr;
+
+ // Now do the reversing using configured tensor executor.
+ Tensor <T, NumDims, Options, Index> dst(golden.dimensions());
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tileable>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
#define CALL_SUBTEST_PART(PART) \
CALL_SUBTEST_##PART
@@ -613,8 +648,14 @@ EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
+
// Force CMake to split this test.
- // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13
+ // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14
}
#undef CALL_SUBTEST_COMBINATIONS