aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Ilya Tokar <tokarip@google.com>2020-06-12 17:20:42 -0400
committerGravatar Ilya Tokar <tokarip@google.com>2020-06-16 15:55:11 -0400
commit231ce21535c7bc0a145b581f823c6da00da175a9 (patch)
tree6c02d2a1e0a8118c81e0cdb690c9ee588fee7743
parenta475bf14d4e79f783f3cf6285c467093a2f84f37 (diff)
Run two independent chains when reducing tensors.
Running two chains exposes more instruction-level parallelism by allowing both chains to execute at the same time. Results are a bit noisy, but for medium lengths we almost hit the theoretical upper bound of 2x.

BM_fullReduction_16T/3 [using 16 threads] 17.3ns ±11% 17.4ns ± 9% ~ (p=0.178 n=18+19)
BM_fullReduction_16T/4 [using 16 threads] 17.6ns ±17% 17.0ns ±18% ~ (p=0.835 n=20+19)
BM_fullReduction_16T/7 [using 16 threads] 18.9ns ±12% 18.2ns ±10% ~ (p=0.756 n=20+18)
BM_fullReduction_16T/8 [using 16 threads] 19.8ns ±13% 19.4ns ±21% ~ (p=0.512 n=20+20)
BM_fullReduction_16T/10 [using 16 threads] 23.5ns ±15% 20.8ns ±24% -11.37% (p=0.000 n=20+19)
BM_fullReduction_16T/15 [using 16 threads] 35.8ns ±21% 26.9ns ±17% -24.76% (p=0.000 n=20+19)
BM_fullReduction_16T/16 [using 16 threads] 38.7ns ±22% 27.7ns ±18% -28.40% (p=0.000 n=20+19)
BM_fullReduction_16T/31 [using 16 threads] 146ns ±17% 74ns ±11% -49.05% (p=0.000 n=20+18)
BM_fullReduction_16T/32 [using 16 threads] 154ns ±19% 84ns ±30% -45.79% (p=0.000 n=20+19)
BM_fullReduction_16T/64 [using 16 threads] 603ns ± 8% 308ns ±12% -48.94% (p=0.000 n=17+17)
BM_fullReduction_16T/128 [using 16 threads] 2.44µs ±13% 1.22µs ± 1% -50.29% (p=0.000 n=17+17)
BM_fullReduction_16T/256 [using 16 threads] 9.84µs ±14% 5.13µs ±30% -47.82% (p=0.000 n=19+19)
BM_fullReduction_16T/512 [using 16 threads] 78.0µs ± 9% 56.1µs ±17% -28.02% (p=0.000 n=18+20)
BM_fullReduction_16T/1k [using 16 threads] 325µs ± 5% 263µs ± 4% -19.00% (p=0.000 n=20+16)
BM_fullReduction_16T/2k [using 16 threads] 1.09ms ± 3% 0.99ms ± 1% -9.04% (p=0.000 n=20+20)
BM_fullReduction_16T/4k [using 16 threads] 7.66ms ± 3% 7.57ms ± 3% -1.24% (p=0.017 n=20+20)
BM_fullReduction_16T/10k [using 16 threads] 65.3ms ± 4% 65.0ms ± 3% ~ (p=0.718 n=20+20)
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h14
1 files changed, 13 insertions, 1 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 8332a9ae0..af9b58816 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -242,14 +242,26 @@ struct InnerMostDimReducer<Self, Op, true, true> {
}
return reducer.finalize(accum);
} else {
+ const typename Self::Index UnrollSize =
+ (numValuesToReduce / (2*packetSize)) * 2*packetSize;
const typename Self::Index VectorizedSize =
(numValuesToReduce / packetSize) * packetSize;
typename Self::PacketReturnType paccum =
reducer.template initializePacket<typename Self::PacketReturnType>();
- for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
+ typename Self::PacketReturnType paccum2 =
+ reducer.template initializePacket<typename Self::PacketReturnType>();
+ for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
reducer.reducePacket(
self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+ reducer.reducePacket(
+ self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
+ &paccum2);
+ }
+ for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
+ reducer.reducePacket(self.m_impl.template packet<Unaligned>(
+ firstIndex + j), &paccum);
}
+ reducer.reducePacket(paccum2, &paccum);
for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
++j) {
reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);