From 13c3327f5cf829fd9d04a2ab46861e722cd74ca0 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Tue, 12 Nov 2019 10:12:28 -0800
Subject: Remove legacy block evaluation support

---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 202 ---------------------
 1 file changed, 202 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 0fb0a9227..9926046b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -153,70 +153,6 @@ class TensorExecutor
-class TensorExecutor {
- public:
-  typedef typename traits::Scalar Scalar;
-  typedef typename remove_const::type ScalarNoConst;
-
-  typedef TensorEvaluator Evaluator;
-  typedef typename traits::Index StorageIndex;
-
-  static const int NumDims = traits::NumDimensions;
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(const Expression& expr,
-                                      const DefaultDevice& device = DefaultDevice()) {
-    typedef TensorBlock TensorBlock;
-    typedef TensorBlockMapper TensorBlockMapper;
-    typedef typename TensorBlock::Dimensions TensorBlockDimensions;
-
-    Evaluator evaluator(expr, device);
-    Index total_size = array_prod(evaluator.dimensions());
-    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-
-    if (total_size < cache_size
-        && !ExpressionHasTensorBroadcastingOp::value) {
-      // TODO(andydavis) Reduce block management overhead for small tensors.
-      internal::TensorExecutor::run(expr, device);
-      evaluator.cleanup();
-      return;
-    }
-
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign) {
-      // Size tensor blocks to fit in cache (or requested target block size).
-      Index block_total_size = numext::mini(cache_size, total_size);
-      TensorBlockShapeType block_shape = kSkewedInnerDims;
-      // Query expression tree for desired block size/shape.
-      std::vector resources;
-      evaluator.getResourceRequirements(&resources);
-      MergeResourceRequirements(resources, &block_shape, &block_total_size);
-
-      TensorBlockMapper block_mapper(
-          TensorBlockDimensions(evaluator.dimensions()), block_shape,
-          block_total_size);
-      block_total_size = block_mapper.block_dims_total_size();
-
-      ScalarNoConst* data = static_cast(
-          device.allocate(block_total_size * sizeof(Scalar)));
-
-      const StorageIndex total_block_count = block_mapper.total_block_count();
-      for (StorageIndex i = 0; i < total_block_count; ++i) {
-        TensorBlock block = block_mapper.GetBlockForIndex(i, data);
-        evaluator.evalBlock(&block);
-      }
-      device.deallocate(data);
-    }
-    evaluator.cleanup();
-  }
-};
-
 /**
  * Process all the data with a single cpu thread, using blocks of data. By
  * sizing a block to fit L1 cache we get better cache performance.
@@ -446,59 +382,6 @@ class TensorExecutor {
   }
 };
-template
-class TensorExecutor {
- public:
-  typedef typename traits::Index StorageIndex;
-  typedef typename traits::Scalar Scalar;
-  typedef typename remove_const::type ScalarNoConst;
-
-  static const int NumDims = traits::NumDimensions;
-
-  typedef TensorEvaluator Evaluator;
-  typedef TensorBlockMapper BlockMapper;
-  typedef TensorExecutorTilingContext TilingContext;
-
-  static EIGEN_STRONG_INLINE void run(const Expression& expr,
-                                      const ThreadPoolDevice& device) {
-    Evaluator evaluator(expr, device);
-    Index total_size = array_prod(evaluator.dimensions());
-    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-
-    if (total_size < cache_size &&
-        !ExpressionHasTensorBroadcastingOp::value) {
-      // TODO(andydavis) Reduce block management overhead for small tensors.
-      internal::TensorExecutor::run(expr,
-                                    device);
-      evaluator.cleanup();
-      return;
-    }
-
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
-    if (needs_assign) {
-      const TilingContext tiling =
-          internal::GetTensorExecutorTilingContext(device, evaluator);
-
-      device.parallelFor(
-          tiling.block_mapper.total_block_count(), tiling.cost,
-          [=, &device, &evaluator, &tiling](StorageIndex firstIdx,
-                                            StorageIndex lastIdx) {
-            ScalarNoConst* thread_buf =
-                tiling.template GetCurrentThreadBuffer(device);
-            for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
-              auto block = tiling.block_mapper.GetBlockForIndex(i, thread_buf);
-              evaluator.evalBlock(&block);
-            }
-          });
-      device.deallocate(tiling.buffer);
-    }
-    evaluator.cleanup();
-  }
-};
-
 template
 class TensorExecutor {
@@ -603,91 +486,6 @@ class TensorAsyncExecutor
-class TensorAsyncExecutor {
- public:
-  typedef typename traits::Index StorageIndex;
-  typedef typename traits::Scalar Scalar;
-  typedef typename remove_const::type ScalarNoConst;
-
-  static const int NumDims = traits::NumDimensions;
-
-  typedef TensorEvaluator Evaluator;
-  typedef TensorBlockMapper
-      BlockMapper;
-  typedef TensorExecutorTilingContext TilingContext;
-
-  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
-                                           const ThreadPoolDevice& device,
-                                           DoneCallback done) {
-    TensorAsyncExecutorContext* const ctx =
-        new TensorAsyncExecutorContext(expr, device, std::move(done));
-
-    Index total_size = array_prod(ctx->evaluator.dimensions());
-    Index cache_size = device.firstLevelCacheSize() / sizeof(Scalar);
-
-    if (total_size < cache_size &&
-        !ExpressionHasTensorBroadcastingOp::value) {
-      auto delete_ctx = [ctx]() { delete ctx; };
-      internal::TensorAsyncExecutor<
-          Expression, ThreadPoolDevice, decltype(delete_ctx), Vectorizable,
-          /*Tileable*/ TiledEvaluation::Off>::runAsync(expr, device, std::move(delete_ctx));
-      return;
-    }
-
-    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
-      if (!need_assign) {
-        delete ctx;
-        return;
-      }
-
-      ctx->tiling =
-          GetTensorExecutorTilingContext(
-              device, ctx->evaluator);
-
-      auto eval_block = [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
-        ScalarNoConst* thread_buf =
-            ctx->tiling.template GetCurrentThreadBuffer(
-                ctx->device);
-        for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
-          auto block = ctx->tiling.block_mapper.GetBlockForIndex(i, thread_buf);
-          ctx->evaluator.evalBlock(&block);
-        }
-      };
-      device.parallelForAsync(ctx->tiling.block_mapper.total_block_count(),
-                              ctx->tiling.cost, eval_block,
-                              [ctx]() { delete ctx; });
-    };
-
-    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
-  }
-
- private:
-  struct TensorAsyncExecutorContext {
-    TensorAsyncExecutorContext(const Expression& expr,
-                               const ThreadPoolDevice& thread_pool,
-                               DoneCallback done)
-        : device(thread_pool),
-          evaluator(expr, thread_pool),
-          on_done(std::move(done)) {}
-
-    ~TensorAsyncExecutorContext() {
-      device.deallocate(tiling.buffer);
-      evaluator.cleanup();
-      on_done();
-    }
-
-    const ThreadPoolDevice& device;
-    Evaluator evaluator;
-    TilingContext tiling;
-
-   private:
-    DoneCallback on_done;
-  };
-};
-
 template
 class TensorAsyncExecutor {
--
cgit v1.2.3