path: root/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
author    Eugene Zhulenev <ezhulenev@google.com>    2018-07-25 13:51:10 -0700
committer Eugene Zhulenev <ezhulenev@google.com>    2018-07-25 13:51:10 -0700
commit 6913221c43c6ad41b1fbfc0d263d2764abd11ad2 (patch)
tree   e5dbd8f9c73087d37b1e812bc679d1dec2d3bfcd /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parent d55efa6f0f9ab9ec758c6b40204be476c01b7528 (diff)
Add tiled evaluation support to TensorExecutor
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 262
1 file changed, 199 insertions(+), 63 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 53640c6aa..024de3696 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -12,29 +12,37 @@
namespace Eigen {
-/** \class TensorExecutor
- * \ingroup CXX11_Tensor_Module
- *
- * \brief The tensor executor class.
- *
- * This class is responsible for launch the evaluation of the expression on
- * the specified computing device.
- */
+/**
+ * \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launching the evaluation of the expression
+ * on the specified computing device.
+ *
+ * @tparam Vectorizable whether the expression can be evaluated with packet
+ *                      math (SSE/AVX/etc. registers and instructions)
+ * @tparam Tileable whether the expression can use block-based tensor
+ *                  evaluation (see TensorBlock.h)
+ */
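User code never names TensorExecutor directly; it is reached through tensor assignment, which deduces the Vectorizable and Tileable flags from the expression and device. A minimal usage example, assuming Eigen's unsupported Tensor module is on the include path:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
      b.setRandom();
      // The assignment below is launched through internal::TensorExecutor;
      // the Vectorizable/Tileable flags are chosen by the assignment machinery.
      a = b * 2.0f;
      return 0;
    }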
namespace internal {
-// Default strategy: the expression is evaluated with a single cpu thread.
-template<typename Expression, typename Device, bool Vectorizable>
-class TensorExecutor
-{
+/**
+ * Default strategy: the expression is evaluated sequentially with a single
+ * CPU thread, without vectorization or block evaluation.
+ */
+template <typename Expression, typename Device, bool Vectorizable,
+ bool Tileable>
+class TensorExecutor {
public:
typedef typename Expression::Index Index;
EIGEN_DEVICE_FUNC
- static inline void run(const Expression& expr, const Device& device = Device())
- {
+ static inline void run(const Expression& expr,
+ const Device& device = Device()) {
TensorEvaluator<Expression, Device> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
+ if (needs_assign) {
const Index size = array_prod(evaluator.dimensions());
for (Index i = 0; i < size; ++i) {
evaluator.evalScalar(i);
@@ -44,12 +52,14 @@ class TensorExecutor
}
};
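The default strategy is a plain coefficient-at-a-time loop. A standalone sketch of the same control flow, where MockEvaluator is an illustrative stand-in for TensorEvaluator (not part of Eigen):

    #include <cstddef>
    #include <vector>

    // Illustrative stand-in for TensorEvaluator: computes out[i] = 2 * in[i].
    struct MockEvaluator {
      const std::vector<float>& in;
      std::vector<float>& out;
      std::size_t size() const { return in.size(); }
      void evalScalar(std::size_t i) { out[i] = 2.0f * in[i]; }
    };

    // Mirrors the default TensorExecutor::run: evaluate one scalar at a time.
    void run_sequential(MockEvaluator& evaluator) {
      for (std::size_t i = 0; i < evaluator.size(); ++i)
        evaluator.evalScalar(i);
    }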
-
-template<typename Expression>
-class TensorExecutor<Expression, DefaultDevice, true>
-{
+/**
+ * Process all the data with a single CPU thread, using vectorized instructions.
+ */
+template <typename Expression>
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
+                     /*Tileable*/ false> {
public:
typedef typename Expression::Index Index;
+
EIGEN_DEVICE_FUNC
static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
{
@@ -58,9 +68,11 @@ class TensorExecutor<Expression, DefaultDevice, true>
if (needs_assign)
{
const Index size = array_prod(evaluator.dimensions());
- const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
- // Give the compiler a strong hint to unroll the loop. But don't insist
- // on unrolling, because if the function is expensive the compiler should not
+ const int PacketSize = unpacket_traits<typename TensorEvaluator<
+ Expression, DefaultDevice>::PacketReturnType>::size;
+
+      // Give the compiler a strong opportunity to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive the compiler should not
// unroll the loop at the expense of inlining.
const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
@@ -80,9 +92,75 @@ class TensorExecutor<Expression, DefaultDevice, true>
}
};
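The vectorized specialization rounds the trip count down to a multiple of 4 * PacketSize, evaluates four packets per iteration, and leaves the tail to a packet-wide loop and then a scalar loop. A self-contained sketch of that index arithmetic (the PacketSize value and the loop bodies are illustrative):

    #include <cassert>

    void eval_unrolled(long size) {
      const int PacketSize = 8;  // e.g. floats in one AVX register; illustrative
      // Largest multiple of 4 * PacketSize that does not exceed size.
      const long UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
      const long VectorizedSize = (size / PacketSize) * PacketSize;
      long i = 0;
      for (; i < UnrolledSize; i += 4 * PacketSize) {
        // evaluator.evalPacket(i + j * PacketSize) for j = 0, 1, 2, 3
      }
      for (; i < VectorizedSize; i += PacketSize) {
        // evaluator.evalPacket(i)
      }
      for (; i < size; ++i) {
        // evaluator.evalScalar(i)
      }
      assert(i == size);
    }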
+/**
+ * Process all the data with a single CPU thread, using blocks of data. By
+ * sizing blocks to fit the L1 cache we get better cache performance.
+ */
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                     /*Tileable*/ true> {
+ public:
+ typedef typename Expression::Index Index;
+
+ EIGEN_DEVICE_FUNC
+ static inline void run(const Expression& expr,
+ const DefaultDevice& device = DefaultDevice()) {
+ using Evaluator = TensorEvaluator<Expression, DefaultDevice>;
+ using Index = typename traits<Expression>::Index;
+ const int NumDims = traits<Expression>::NumDimensions;
+ using Scalar = typename traits<Expression>::Scalar;
+ using ScalarNoConst = typename remove_const<Scalar>::type;
+
+ using TensorBlock =
+ TensorBlock<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
+ using TensorBlockMapper =
+ TensorBlockMapper<ScalarNoConst, Index, NumDims, Evaluator::Layout>;
+
+ Evaluator evaluator(expr, device);
+ std::size_t total_size = array_prod(evaluator.dimensions());
+ std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+
+ if (total_size < cache_size) {
+ // TODO(andydavis) Reduce block management overhead for small tensors.
+ // TODO(wuke) Do not do this when evaluating TensorBroadcastingOp.
+ internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
+ false>::run(expr, device);
+ return;
+ }
+
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign) {
+ // Size tensor blocks to fit in cache (or requested target block size).
+ size_t block_total_size = numext::mini(cache_size, total_size);
+ TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
+ // Query expression tree for desired block size/shape.
+ std::vector<TensorOpResourceRequirements> resources;
+ evaluator.getResourceRequirements(&resources);
+ MergeResourceRequirements(resources, &block_shape, &block_total_size);
+
+ TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
+ block_total_size);
+ block_total_size = block_mapper.block_dims_total_size();
+
+ Scalar* data = static_cast<Scalar*>(
+ device.allocate(block_total_size * sizeof(Scalar)));
+
+ const Index total_block_count = block_mapper.total_block_count();
+ for (Index i = 0; i < total_block_count; ++i) {
+ TensorBlock block = block_mapper.GetBlockForIndex(i, data);
+ evaluator.evalBlock(&block);
+ }
+ device.deallocate(data);
+ }
+ evaluator.cleanup();
+ }
+};
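The tiled executor walks a flat block index space and reuses one heap buffer for every block. A hedged sketch of that outer loop, with eval_block standing in for TensorBlockMapper::GetBlockForIndex plus Evaluator::evalBlock (names illustrative):

    #include <cstddef>
    #include <cstdlib>

    // Evaluate `total` coefficients in chunks of at most `block_size`, reusing
    // a single scratch buffer for all chunks, as the executor above does.
    void eval_in_blocks(std::size_t total, std::size_t block_size,
                        void (*eval_block)(float* scratch, std::size_t n)) {
      float* scratch =
          static_cast<float*>(std::malloc(block_size * sizeof(float)));
      for (std::size_t start = 0; start < total; start += block_size) {
        const std::size_t n =
            start + block_size <= total ? block_size : total - start;
        eval_block(scratch, n);  // materialize one block via the scratch buffer
      }
      std::free(scratch);
    }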
+
-// Multicore strategy: the index space is partitioned and each partition is executed on a single core
+/**
+ * Multicore strategy: the index space is partitioned and each partition is
+ * executed on a single core.
+ */
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
@@ -100,7 +178,7 @@ struct EvalRange {
};
template <typename Evaluator, typename Index>
-struct EvalRange<Evaluator, Index, true> {
+struct EvalRange<Evaluator, Index, /*Vectorizable*/ true> {
static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
static void run(Evaluator* evaluator_in, const Index first, const Index last) {
@@ -110,8 +188,8 @@ struct EvalRange<Evaluator, Index, true> {
if (last - first >= PacketSize) {
eigen_assert(first % PacketSize == 0);
Index last_chunk_offset = last - 4 * PacketSize;
- // Give the compiler a strong hint to unroll the loop. But don't insist
- // on unrolling, because if the function is expensive the compiler should not
+      // Give the compiler a strong opportunity to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive the compiler should not
// unroll the loop at the expense of inlining.
for (; i <= last_chunk_offset; i += 4*PacketSize) {
for (Index j = 0; j < 4; j++) {
@@ -138,55 +216,113 @@ struct EvalRange<Evaluator, Index, true> {
}
};
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
+template <typename Expression, bool Vectorizable, bool Tileable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
public:
typedef typename Expression::Index Index;
- static inline void run(const Expression& expr, const ThreadPoolDevice& device)
- {
+
+ static inline void run(const Expression& expr,
+ const ThreadPoolDevice& device) {
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+ typedef EvalRange<Evaluator, Index, Vectorizable> EvalRange;
+
Evaluator evaluator(expr, device);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign)
- {
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ if (needs_assign) {
+ const Index PacketSize =
+ Vectorizable
+ ? unpacket_traits<typename Evaluator::PacketReturnType>::size
+ : 1;
const Index size = array_prod(evaluator.dimensions());
- size_t num_threads = device.numThreads();
- if (num_threads > 1) {
- num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
- size, evaluator.costPerCoeff(Vectorizable), num_threads);
- }
- if (num_threads == 1) {
- EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
- } else {
- const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
- Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
- const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
- const Index numblocks = size / blocksize;
-
- Barrier barrier(numblocks);
- for (int i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(
- &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
- &evaluator, i * blocksize, (i + 1) * blocksize);
- }
- if (numblocks * blocksize < size) {
- EvalRange<Evaluator, Index, Vectorizable>::run(
- &evaluator, numblocks * blocksize, size);
- }
- barrier.Wait();
- }
+ device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+ EvalRange::alignBlockSize,
+ [&evaluator](Index first, Index last) {
+ EvalRange::run(&evaluator, first, last);
+ });
+ }
+ evaluator.cleanup();
+ }
+};
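device.parallelFor replaces the hand-rolled partitioning in the removed code: it splits [0, size) into cost-balanced, packet-aligned ranges and runs the lambda on pool threads. A minimal sketch of that pattern with std::thread, omitting Eigen's cost model (the block sizing here is illustrative):

    #include <algorithm>
    #include <functional>
    #include <thread>
    #include <vector>

    // Split [0, size) into contiguous ranges whose starts are multiples of
    // `align` (e.g. PacketSize) and run `fn(first, last)` on worker threads.
    void parallel_for_sketch(long size, long align, int num_threads,
                             const std::function<void(long, long)>& fn) {
      long block = (size + num_threads - 1) / num_threads;  // ceil(size / threads)
      block = std::max(align, block - block % align);       // keep ranges aligned
      std::vector<std::thread> workers;
      for (long first = 0; first < size; first += block)
        workers.emplace_back(fn, first, std::min(first + block, size));
      for (std::thread& t : workers) t.join();
    }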
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+                     /*Tileable*/ true> {
+ public:
+ typedef typename Expression::Index Index;
+
+ static inline void run(const Expression& expr,
+ const ThreadPoolDevice& device) {
+ typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+ typedef typename internal::remove_const<
+ typename traits<Expression>::Scalar>::type Scalar;
+ typedef typename traits<Expression>::Index Index;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
+
+ typedef TensorBlock<Scalar, Index, NumDims, Evaluator::Layout> TensorBlock;
+ typedef TensorBlockMapper<Scalar, Index, NumDims, Evaluator::Layout>
+ TensorBlockMapper;
+
+ Evaluator evaluator(expr, device);
+ std::size_t total_size = array_prod(evaluator.dimensions());
+ std::size_t cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+ if (total_size < cache_size) {
+ // TODO(andydavis) Reduce block management overhead for small tensors.
+ internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+ false>::run(expr, device);
+ evaluator.cleanup();
+ return;
+ }
+
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ if (needs_assign) {
+ TensorBlockShapeType block_shape = TensorBlockShapeType::kSkewedInnerDims;
+ size_t block_total_size = 0;
+ // Query expression tree for desired block size/shape.
+ std::vector<internal::TensorOpResourceRequirements> resources;
+ evaluator.getResourceRequirements(&resources);
+ MergeResourceRequirements(resources, &block_shape, &block_total_size);
+ int num_threads = device.numThreads();
+
+ // Estimate minimum block size based on cost.
+ TensorOpCost cost = evaluator.costPerCoeff(Vectorizable);
+ double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, cost);
+ size_t block_size = static_cast<size_t>(1.0 / taskSize);
+ TensorBlockMapper block_mapper(evaluator.dimensions(), block_shape,
+ block_size);
+ block_size = block_mapper.block_dims_total_size();
+ const size_t aligned_blocksize =
+ EIGEN_MAX_ALIGN_BYTES *
+ divup<size_t>(block_size * sizeof(Scalar), EIGEN_MAX_ALIGN_BYTES);
+ void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
+ device.parallelFor(
+ block_mapper.total_block_count(), cost * block_size,
+ [=, &device, &evaluator, &block_mapper](Index first, Index last) {
+ // currentThreadId() returns -1 if called from a thread not in the
+ // threadpool, such as the main thread dispatching Eigen
+ // expressions.
+ const int thread_idx = device.currentThreadId();
+ eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
+ Scalar* thread_buf = reinterpret_cast<Scalar*>(
+ static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
+ for (Index i = first; i < last; ++i) {
+ auto block = block_mapper.GetBlockForIndex(i, thread_buf);
+ evaluator.evalBlock(&block);
+ }
+ });
+ device.deallocate(buf);
}
evaluator.cleanup();
}
};
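Two details above deserve a note: each scratch slot is rounded up to a multiple of EIGEN_MAX_ALIGN_BYTES, and the `thread_idx + 1` offset reserves slot 0 for callers outside the pool, where currentThreadId() returns -1. The same arithmetic in isolation (ALIGN is an illustrative stand-in for EIGEN_MAX_ALIGN_BYTES):

    #include <cstddef>

    constexpr std::size_t ALIGN = 64;  // stand-in for EIGEN_MAX_ALIGN_BYTES

    // Round one block's byte size up to a multiple of the alignment.
    constexpr std::size_t aligned_block_bytes(std::size_t bytes) {
      return ALIGN * ((bytes + ALIGN - 1) / ALIGN);
    }

    // Map a caller to its scratch slot: pool threads 0..num_threads-1 use
    // slots 1..num_threads; a non-pool caller (thread_id == -1) uses slot 0.
    char* thread_scratch(char* buf, std::size_t slot_bytes, int thread_id) {
      return buf + slot_bytes * static_cast<std::size_t>(thread_id + 1);
    }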
+
#endif // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, GpuDevice, Vectorizable> {
+template <typename Expression, bool Vectorizable, bool Tileable>
+class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
public:
typedef typename Expression::Index Index;
static void run(const Expression& expr, const GpuDevice& device);
@@ -236,8 +372,8 @@ EigenMetaKernel(Evaluator eval, Index size) {
}
/*static*/
-template <typename Expression, bool Vectorizable>
-inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
+template <typename Expression, bool Vectorizable, bool Tileable>
+inline void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);