author    Eugene Zhulenev <ezhulenev@google.com>  2019-09-24 12:52:45 -0700
committer Eugene Zhulenev <ezhulenev@google.com>  2019-09-24 12:52:45 -0700
commit    ef9dfee7bdc8e0d82c9b7ddf9414ef99d866d7ba (patch)
tree      490a8ae1f247cf226475f504ea1d3ab305b98097 /unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
parent    efd9867ff0e8df23016ac6c9828d0d7bf8bec1b1 (diff)
Tensor block evaluation V2 support for unary/binary/broadcasting
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 172
1 file changed, 150 insertions(+), 22 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index cf07656b3..a7cb8dc97 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -23,7 +23,7 @@ namespace Eigen {
*
* @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
* instructions)
- * @tparam Tileable can use block based tensor evaluation
+ * @tparam Tiling can use block based tensor evaluation
* (see TensorBlock.h)
*/
namespace internal {
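
The change below replaces the boolean Tileable flag with a TiledEvaluation enum. A minimal sketch of what that enum plausibly looks like, mirroring the #if EIGEN_HAS_CXX11 split in the hunk that follows (the exact declaration lives elsewhere in Eigen and may differ):

    #if EIGEN_HAS_CXX11
    enum TiledEvaluation {
      Off = 0,    // no block evaluation
      On = 1,     // evaluate through the V2 block API (evalBlockV2)
      Legacy = 2  // evaluate through the original block API
    };
    #else
    // Pre-C++11 fallback: wrap the enum in a namespace so callers can still
    // write TiledEvaluation::Off; the type is then spelled
    // TiledEvaluation::TiledEvaluation, which explains the #else branch of
    // the template declaration below.
    namespace TiledEvaluation {
    enum TiledEvaluation { Off = 0, On = 1, Legacy = 2 };
    }
    #endif
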
@@ -76,8 +76,13 @@ struct ExpressionHasTensorBroadcastingOp<
* Default strategy: the expression is evaluated sequentially with a single cpu
* thread, without vectorization and block evaluation.
*/
+#if EIGEN_HAS_CXX11
template <typename Expression, typename Device, bool Vectorizable,
- bool Tileable>
+ TiledEvaluation Tiling>
+#else
+ template <typename Expression, typename Device, bool Vectorizable,
+ TiledEvaluation::TiledEvaluation Tiling>
+#endif
class TensorExecutor {
public:
typedef typename Expression::Index StorageIndex;
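
For orientation, the generic single-threaded executor declared here reduces to a plain coefficient loop; a simplified sketch of its run(), close to the surrounding file:

    static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                        const Device& device = Device()) {
      TensorEvaluator<Expression, Device> evaluator(expr, device);
      // evalSubExprsIfNeeded returns true when the result must be assigned here.
      if (evaluator.evalSubExprsIfNeeded(NULL)) {
        const StorageIndex size = array_prod(evaluator.dimensions());
        for (StorageIndex i = 0; i < size; ++i) {
          evaluator.evalScalar(i);  // one output coefficient per iteration
        }
      }
      evaluator.cleanup();
    }
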
@@ -109,8 +114,8 @@ class TensorAsyncExecutor {};
* Process all the data with a single cpu thread, using vectorized instructions.
*/
template <typename Expression>
-class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
- /*Tileable*/ false> {
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
+ /*Tiling=*/TiledEvaluation::Off> {
public:
typedef typename Expression::Index StorageIndex;
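
The vectorized specialization replaces the scalar loop with packet stores; a simplified sketch (the real loop in this file additionally unrolls by several packets):

    typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
    const StorageIndex size = array_prod(evaluator.dimensions());
    const int PacketSize =
        unpacket_traits<typename Evaluator::PacketReturnType>::size;
    // Round size down to a multiple of the packet width.
    const StorageIndex vectorized_size = (size / PacketSize) * PacketSize;
    for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
      evaluator.evalPacket(i);  // SIMD: PacketSize coefficients at once
    }
    for (StorageIndex i = vectorized_size; i < size; ++i) {
      evaluator.evalScalar(i);  // scalar tail
    }
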
@@ -152,7 +157,7 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable*/ true,
*/
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
- /*Tileable*/ true> {
+ /*Tiling=*/TiledEvaluation::Legacy> {
public:
typedef typename traits<Expression>::Scalar Scalar;
typedef typename remove_const<Scalar>::type ScalarNoConst;
@@ -176,8 +181,7 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
if (total_size < cache_size
&& !ExpressionHasTensorBroadcastingOp<Expression>::value) {
// TODO(andydavis) Reduce block management overhead for small tensors.
- internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
- /*Tileable*/ false>::run(expr, device);
+    internal::TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                             /*Tiling=*/TiledEvaluation::Off>::run(expr, device);
evaluator.cleanup();
return;
}
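
A concrete instance of the fallback above, assuming a 32 KB firstLevelCacheSize() (the actual value is device-dependent):

    // With Scalar == float: cache_size = 32768 / 4 = 8192 coefficients.
    // A 64x64 expression (4096 coeffs, no broadcasting) takes the non-tiled
    // path above; a 128x128 one (16384 coeffs) is evaluated block by block.
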
@@ -212,6 +216,70 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
};
/**
+ * Process all the data with a single cpu thread, using blocks of data. By
+ * sizing a block to fit L1 cache we get better cache performance.
+ */
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+ /*Tiling=*/TiledEvaluation::On> {
+ public:
+ typedef typename traits<Expression>::Scalar Scalar;
+ typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+ typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
+ typedef typename traits<Expression>::Index StorageIndex;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(const Expression& expr,
+ const DefaultDevice& device = DefaultDevice()) {
+ typedef TensorBlock<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlock;
+ typedef TensorBlockMapper<ScalarNoConst, StorageIndex, NumDims, Evaluator::Layout> TensorBlockMapper;
+ typedef typename TensorBlock::Dimensions TensorBlockDimensions;
+
+ typedef internal::TensorBlockDescriptor<NumDims> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<DefaultDevice>
+ TensorBlockScratch;
+
+ Evaluator evaluator(expr, device);
+ Index total_size = array_prod(evaluator.dimensions());
+ Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
+
+ // TODO(ezhulenev): Do not use tiling for small tensors?
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+
+ if (needs_assign) {
+ // Size tensor blocks to fit in cache (or requested target block size).
+ Index block_total_size = numext::mini(cache_size, total_size);
+ TensorBlockShapeType block_shape = kSkewedInnerDims;
+ // Query expression tree for desired block size/shape.
+ std::vector<TensorOpResourceRequirements> resources;
+ evaluator.getResourceRequirements(&resources);
+ MergeResourceRequirements(resources, &block_shape, &block_total_size);
+
+ TensorBlockMapper block_mapper(
+ TensorBlockDimensions(evaluator.dimensions()), block_shape,
+ block_total_size);
+ block_total_size = block_mapper.block_dims_total_size();
+
+ // Share scratch memory allocator between all blocks.
+ TensorBlockScratch scratch(device);
+
+ const StorageIndex total_block_count = block_mapper.total_block_count();
+ for (StorageIndex i = 0; i < total_block_count; ++i) {
+ TensorBlock block = block_mapper.GetBlockForIndex(i, NULL);
+
+ TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
+ evaluator.evalBlockV2(desc, scratch);
+ scratch.reset();
+ }
+ }
+ evaluator.cleanup();
+ }
+};
+
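
These executors are not invoked directly; a device assignment selects the specialization. A hedged usage sketch (whether the TiledEvaluation::On path is picked depends on the evaluator's block-access traits, which this patch wires up for unary/binary/broadcasting expressions):

    #include <unsupported/Eigen/CXX11/Tensor>

    void tiled_eval_example() {
      Eigen::Tensor<float, 2> in(256, 256), out(256, 256);
      in.setRandom();
      // Assignment through a device runs internal::TensorExecutor; for a
      // block-access-capable unary expression, the evalBlockV2 loop above
      // can be used.
      out.device(Eigen::DefaultDevice()) = in.abs() * 2.0f;
    }
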
+/**
* Multicore strategy: the index space is partitioned and each partition is
* executed on a single core.
*
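
A simplified sketch of that non-tiled multicore path: parallelFor splits the flat index space and EvalRange runs the scalar/packet loop on each sub-range (close to the code later in this file):

    const StorageIndex size = array_prod(evaluator.dimensions());
    device.parallelFor(
        size, evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize,
        [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
          EvalRange::run(&evaluator, firstIdx, lastIdx);
        });
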
@@ -256,10 +324,11 @@ struct TensorExecutorTilingContext {
};
// Computes block evaluation parameters, and allocates a temporary memory buffer
-// for blocks. See TensorExecutor/TensorAsyncExecutor (Tileable=true) below.
+// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
- const ThreadPoolDevice& device, const Evaluator& evaluator) {
+ const ThreadPoolDevice& device, const Evaluator& evaluator,
+ bool allocate_buffer = true) {
// Prefer blocks skewed toward inner dimension.
TensorBlockShapeType block_shape = kSkewedInnerDims;
Index block_total_size = 0;
@@ -284,7 +353,13 @@ TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
const size_t aligned_blocksize =
align *
divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
- void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
+
+  // TODO(ezhulenev): In the new block evaluation framework there is no need
+  // to allocate temporary buffers; remove this after migration.
+ void* buf = NULL;
+ if (allocate_buffer) {
+ buf = device.allocate((num_threads + 1) * aligned_blocksize);
+ }
return {block_mapper, cost * block_size, buf, aligned_blocksize};
}
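
For reference, the context returned by the aggregate initializer above bundles four values; a hedged sketch consistent with {block_mapper, cost * block_size, buf, aligned_blocksize} (field names are assumptions):

    template <typename TensorBlockMapper>
    struct TensorExecutorTilingContext {
      TensorBlockMapper block_mapper;  // maps block index -> coordinates/sizes
      TensorOpCost cost;               // estimated cost of one whole block
      void* buffer;                    // shared scratch (NULL when not allocated)
      size_t aligned_blocksize;        // per-thread slice within `buffer`
    };
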
@@ -344,8 +419,8 @@ struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
}
};
-template <typename Expression, bool Vectorizable, bool Tileable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
@@ -369,7 +444,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
};
template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ true> {
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+ /*Tiling=*/TiledEvaluation::Legacy> {
public:
typedef typename traits<Expression>::Index StorageIndex;
typedef typename traits<Expression>::Scalar Scalar;
@@ -387,11 +463,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
Index total_size = array_prod(evaluator.dimensions());
Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
- if (total_size < cache_size
- && !ExpressionHasTensorBroadcastingOp<Expression>::value) {
+ if (total_size < cache_size &&
+ !ExpressionHasTensorBroadcastingOp<Expression>::value) {
// TODO(andydavis) Reduce block management overhead for small tensors.
internal::TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
- /*Tileable*/ false>::run(expr, device);
+ /*Tiling=*/TiledEvaluation::Off>::run(expr,
+ device);
evaluator.cleanup();
return;
}
@@ -419,6 +496,57 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
}
};
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+ /*Tiling=*/TiledEvaluation::On> {
+ public:
+ typedef typename traits<Expression>::Index IndexType;
+ typedef typename traits<Expression>::Scalar Scalar;
+ typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
+
+ typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+ typedef TensorBlockMapper<ScalarNoConst, IndexType, NumDims,
+ Evaluator::Layout>
+ BlockMapper;
+ typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+ typedef internal::TensorBlockDescriptor<NumDims, IndexType>
+ TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
+ TensorBlockScratch;
+
+ static EIGEN_STRONG_INLINE void run(const Expression& expr,
+ const ThreadPoolDevice& device) {
+ Evaluator evaluator(expr, device);
+
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ if (needs_assign) {
+ const TilingContext tiling =
+ internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
+ Vectorizable>(
+ device, evaluator, /*allocate_buffer=*/false);
+
+ auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
+ IndexType lastBlockIdx) {
+ TensorBlockScratch scratch(device);
+
+ for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
+ auto block = tiling.block_mapper.GetBlockForIndex(block_idx, nullptr);
+ TensorBlockDesc desc(block.first_coeff_index(), block.block_sizes());
+ evaluator.evalBlockV2(desc, scratch);
+ scratch.reset();
+ }
+ };
+
+ device.parallelFor(tiling.block_mapper.total_block_count(), tiling.cost,
+ eval_block);
+ }
+ evaluator.cleanup();
+ }
+};
+
template <typename Expression, typename DoneCallback, bool Vectorizable,
bool Tileable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
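
The async executor mirrors the synchronous thread-pool one but signals completion through DoneCallback instead of blocking; a hedged usage sketch (runAsync and its exact signature are assumed from the class shape here):

    // Hypothetical call site: evaluate `expr` on the thread pool and invoke
    // `done` once every block has been processed.
    auto done = []() { /* evaluation finished */ };
    internal::TensorAsyncExecutor<decltype(expr), Eigen::ThreadPoolDevice,
                                  decltype(done), /*Vectorizable=*/true,
                                  /*Tileable=*/true>::runAsync(expr, device,
                                                               std::move(done));
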
@@ -562,8 +690,8 @@ class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
-template <typename Expression, bool Vectorizable, bool Tileable>
-class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
static void run(const Expression& expr, const GpuDevice& device);
@@ -612,8 +740,8 @@ EigenMetaKernel(Evaluator eval, StorageIndex size) {
}
/*static*/
-template <typename Expression, bool Vectorizable, bool Tileable>
-EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable>::run(
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
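
Behind this call, EigenMetaKernel walks the index space with a grid-stride loop; a simplified CUDA-style sketch (the kernel name here is illustrative, and the real kernel also has a vectorized variant):

    template <typename Evaluator, typename StorageIndex>
    __global__ void MetaKernelSketch(Evaluator eval, StorageIndex size) {
      const StorageIndex first = blockIdx.x * blockDim.x + threadIdx.x;
      const StorageIndex step = blockDim.x * gridDim.x;
      // Grid-stride loop: each thread covers first, first + step, ...
      for (StorageIndex i = first; i < size; i += step) {
        eval.evalScalar(i);
      }
    }
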
@@ -711,8 +839,8 @@ struct ExecExprFunctorKernel<Expr, false, Evaluator>
range_, vectorizable_threads_, evaluator) {}
};
-template <typename Expression, bool Vectorizable, bool Tileable>
-class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tileable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index Index;
static EIGEN_STRONG_INLINE void run(const Expression &expr, const Eigen::SyclDevice &dev) {