Run two independent chains, when reducing tensors.

Running two chains exposes more instruction level parallelism, by allowing to execute both chains at the same time. Results are a bit noisy, but for medium length we almost hit theoretical upper bound of 2x. BM_fullReduction_16T/3 [using 16 threads] 17.3ns ±11% 17.4ns ± 9% ~ (p=0.178 n=18+19) BM_fullReduction_16T/4 [using 16 threads] 17.6ns ±17% 17.0ns ±18% ~ (p=0.835 n=20+19) BM_fullReduction_16T/7 [using 16 threads] 18.9ns ±12% 18.2ns ±10% ~ (p=0.756 n=20+18) BM_fullReduction_16T/8 [using 16 threads] 19.8ns ±13% 19.4ns ±21% ~ (p=0.512 n=20+20) BM_fullReduction_16T/10 [using 16 threads] 23.5ns ±15% 20.8ns ±24% -11.37% (p=0.000 n=20+19) BM_fullReduction_16T/15 [using 16 threads] 35.8ns ±21% 26.9ns ±17% -24.76% (p=0.000 n=20+19) BM_fullReduction_16T/16 [using 16 threads] 38.7ns ±22% 27.7ns ±18% -28.40% (p=0.000 n=20+19) BM_fullReduction_16T/31 [using 16 threads] 146ns ±17% 74ns ±11% -49.05% (p=0.000 n=20+18) BM_fullReduction_16T/32 [using 16 threads] 154ns ±19% 84ns ±30% -45.79% (p=0.000 n=20+19) BM_fullReduction_16T/64 [using 16 threads] 603ns ± 8% 308ns ±12% -48.94% (p=0.000 n=17+17) BM_fullReduction_16T/128 [using 16 threads] 2.44µs ±13% 1.22µs ± 1% -50.29% (p=0.000 n=17+17) BM_fullReduction_16T/256 [using 16 threads] 9.84µs ±14% 5.13µs ±30% -47.82% (p=0.000 n=19+19) BM_fullReduction_16T/512 [using 16 threads] 78.0µs ± 9% 56.1µs ±17% -28.02% (p=0.000 n=18+20) BM_fullReduction_16T/1k [using 16 threads] 325µs ± 5% 263µs ± 4% -19.00% (p=0.000 n=20+16) BM_fullReduction_16T/2k [using 16 threads] 1.09ms ± 3% 0.99ms ± 1% -9.04% (p=0.000 n=20+20) BM_fullReduction_16T/4k [using 16 threads] 7.66ms ± 3% 7.57ms ± 3% -1.24% (p=0.017 n=20+20) BM_fullReduction_16T/10k [using 16 threads] 65.3ms ± 4% 65.0ms ± 3% ~ (p=0.718 n=20+20)
author: Ilya Tokar <tokarip@google.com> 2020-06-12 17:20:42 -0400
committer: Ilya Tokar <tokarip@google.com> 2020-06-16 15:55:11 -0400
commit: 231ce21535c7bc0a145b581f823c6da00da175a9 (patch)
tree: 6c02d2a1e0a8118c81e0cdb690c9ee588fee7743
parent: a475bf14d4e79f783f3cf6285c467093a2f84f37 (diff)
1 files changed, 13 insertions, 1 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 8332a9ae0..af9b58816 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -242,14 +242,26 @@ struct InnerMostDimReducer<Self, Op, true, true> {
       }
       return reducer.finalize(accum);
     } else {
+      const typename Self::Index UnrollSize =
+          (numValuesToReduce / (2*packetSize)) * 2*packetSize;
       const typename Self::Index VectorizedSize =
           (numValuesToReduce / packetSize) * packetSize;
       typename Self::PacketReturnType paccum =
           reducer.template initializePacket<typename Self::PacketReturnType>();
-      for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
+      typename Self::PacketReturnType paccum2 =
+          reducer.template initializePacket<typename Self::PacketReturnType>();
+      for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
         reducer.reducePacket(
             self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+        reducer.reducePacket(
+            self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
+            &paccum2);
+      }
+      for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
+        reducer.reducePacket(self.m_impl.template packet<Unaligned>(
+                                 firstIndex + j), &paccum);
       }
+      reducer.reducePacket(paccum2, &paccum);
       for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
            ++j) {
         reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
author	Ilya Tokar <tokarip@google.com>	2020-06-12 17:20:42 -0400
committer	Ilya Tokar <tokarip@google.com>	2020-06-16 15:55:11 -0400
commit	231ce21535c7bc0a145b581f823c6da00da175a9 (patch)
tree	6c02d2a1e0a8118c81e0cdb690c9ee588fee7743
parent	a475bf14d4e79f783f3cf6285c467093a2f84f37 (diff)