From 7b76c85daf293f903682f26733918b7dbfde740a Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Tue, 5 May 2020 00:19:43 +0000
Subject: Vectorize and parallelize TensorScanOp.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TensorScanOp is used in TensorFlow for a number of operations, such as cumulative logexp reduction and cumulative sum and product reductions.

The benchmarks numbers below are for cumulative row- and column reductions of NxN matrices.

name                                                         old time/op             new time/op     delta
BM_cumSumRowReduction_1T/4    [using 1 threads ]             25.1ns ± 1%             35.2ns ± 1%    +40.45%
BM_cumSumRowReduction_1T/8    [using 1 threads ]             73.4ns ± 0%             82.7ns ± 3%    +12.74%
BM_cumSumRowReduction_1T/32   [using 1 threads ]              988ns ± 0%              832ns ± 0%    -15.77%
BM_cumSumRowReduction_1T/64   [using 1 threads ]             4.07µs ± 2%             3.47µs ± 0%    -14.70%
BM_cumSumRowReduction_1T/128  [using 1 threads ]             18.0µs ± 0%             16.8µs ± 0%     -6.58%
BM_cumSumRowReduction_1T/512  [using 1 threads ]              287µs ± 0%              281µs ± 0%     -2.22%
BM_cumSumRowReduction_1T/2k   [using 1 threads ]             4.78ms ± 1%             4.78ms ± 2%       ~
BM_cumSumRowReduction_1T/10k  [using 1 threads ]              117ms ± 1%              117ms ± 1%       ~
BM_cumSumRowReduction_8T/4    [using 8 threads ]             25.0ns ± 0%             35.2ns ± 0%    +40.82%
BM_cumSumRowReduction_8T/8    [using 8 threads ]             77.2ns ±16%             81.3ns ± 0%       ~
BM_cumSumRowReduction_8T/32   [using 8 threads ]              988ns ± 0%              833ns ± 0%    -15.67%
BM_cumSumRowReduction_8T/64   [using 8 threads ]             4.08µs ± 2%             3.47µs ± 0%    -14.95%
BM_cumSumRowReduction_8T/128  [using 8 threads ]             18.0µs ± 0%             17.3µs ±10%       ~
BM_cumSumRowReduction_8T/512  [using 8 threads ]              287µs ± 0%               58µs ± 6%    -79.92%
BM_cumSumRowReduction_8T/2k   [using 8 threads ]             4.79ms ± 1%             0.64ms ± 1%    -86.58%
BM_cumSumRowReduction_8T/10k  [using 8 threads ]              117ms ± 1%               18ms ± 6%    -84.50%

BM_cumSumColReduction_1T/4    [using 1 threads ]             23.9ns ± 0%             33.4ns ± 1%    +39.68%
BM_cumSumColReduction_1T/8    [using 1 threads ]             71.6ns ± 1%             49.1ns ± 3%    -31.40%
BM_cumSumColReduction_1T/32   [using 1 threads ]              973ns ± 0%              165ns ± 2%    -83.10%
BM_cumSumColReduction_1T/64   [using 1 threads ]             4.06µs ± 1%             0.57µs ± 1%    -85.94%
BM_cumSumColReduction_1T/128  [using 1 threads ]             33.4µs ± 1%              4.1µs ± 1%    -87.67%
BM_cumSumColReduction_1T/512  [using 1 threads ]             1.72ms ± 4%             0.21ms ± 5%    -87.91%
BM_cumSumColReduction_1T/2k   [using 1 threads ]              119ms ±53%               11ms ±35%    -90.42%
BM_cumSumColReduction_1T/10k  [using 1 threads ]              1.59s ±67%              0.35s ±49%    -77.96%
BM_cumSumColReduction_8T/4    [using 8 threads ]             23.8ns ± 0%             33.3ns ± 0%    +40.06%
BM_cumSumColReduction_8T/8    [using 8 threads ]             71.6ns ± 1%             49.2ns ± 5%    -31.33%
BM_cumSumColReduction_8T/32   [using 8 threads ]             1.01µs ±12%             0.17µs ± 3%    -82.93%
BM_cumSumColReduction_8T/64   [using 8 threads ]             4.15µs ± 4%             0.58µs ± 1%    -86.09%
BM_cumSumColReduction_8T/128  [using 8 threads ]             33.5µs ± 0%              4.1µs ± 4%    -87.65%
BM_cumSumColReduction_8T/512  [using 8 threads ]             1.71ms ± 3%             0.06ms ±16%    -96.21%
BM_cumSumColReduction_8T/2k   [using 8 threads ]             97.1ms ±14%              3.0ms ±23%    -96.88%
BM_cumSumColReduction_8T/10k  [using 8 threads ]              1.97s ± 8%              0.06s ± 2%    -96.74%
---
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index cee46634c..e524b535a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -190,9 +190,11 @@ struct ThreadPoolDevice {
   void parallelFor(Index n, const TensorOpCost& cost,
                    std::function<Index(Index)> block_align,
                    std::function<void(Index, Index)> f) const {
+    if (EIGEN_PREDICT_FALSE(n <= 0)){
+      return;
     // Compute small problems directly in the caller thread.
-    if (n <= 1 || numThreads() == 1 ||
-        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+    } else if (n == 1 || numThreads() == 1 ||
+               CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
       f(0, n);
       return;
     }
-- 
cgit v1.2.3