From 231ce21535c7bc0a145b581f823c6da00da175a9 Mon Sep 17 00:00:00 2001 From: Ilya Tokar Date: Fri, 12 Jun 2020 17:20:42 -0400 Subject: Run two independent chains, when reducing tensors. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Running two chains exposes more instruction level parallelism, by allowing to execute both chains at the same time. Results are a bit noisy, but for medium length we almost hit theoretical upper bound of 2x. BM_fullReduction_16T/3 [using 16 threads] 17.3ns ±11% 17.4ns ± 9% ~ (p=0.178 n=18+19) BM_fullReduction_16T/4 [using 16 threads] 17.6ns ±17% 17.0ns ±18% ~ (p=0.835 n=20+19) BM_fullReduction_16T/7 [using 16 threads] 18.9ns ±12% 18.2ns ±10% ~ (p=0.756 n=20+18) BM_fullReduction_16T/8 [using 16 threads] 19.8ns ±13% 19.4ns ±21% ~ (p=0.512 n=20+20) BM_fullReduction_16T/10 [using 16 threads] 23.5ns ±15% 20.8ns ±24% -11.37% (p=0.000 n=20+19) BM_fullReduction_16T/15 [using 16 threads] 35.8ns ±21% 26.9ns ±17% -24.76% (p=0.000 n=20+19) BM_fullReduction_16T/16 [using 16 threads] 38.7ns ±22% 27.7ns ±18% -28.40% (p=0.000 n=20+19) BM_fullReduction_16T/31 [using 16 threads] 146ns ±17% 74ns ±11% -49.05% (p=0.000 n=20+18) BM_fullReduction_16T/32 [using 16 threads] 154ns ±19% 84ns ±30% -45.79% (p=0.000 n=20+19) BM_fullReduction_16T/64 [using 16 threads] 603ns ± 8% 308ns ±12% -48.94% (p=0.000 n=17+17) BM_fullReduction_16T/128 [using 16 threads] 2.44µs ±13% 1.22µs ± 1% -50.29% (p=0.000 n=17+17) BM_fullReduction_16T/256 [using 16 threads] 9.84µs ±14% 5.13µs ±30% -47.82% (p=0.000 n=19+19) BM_fullReduction_16T/512 [using 16 threads] 78.0µs ± 9% 56.1µs ±17% -28.02% (p=0.000 n=18+20) BM_fullReduction_16T/1k [using 16 threads] 325µs ± 5% 263µs ± 4% -19.00% (p=0.000 n=20+16) BM_fullReduction_16T/2k [using 16 threads] 1.09ms ± 3% 0.99ms ± 1% -9.04% (p=0.000 n=20+20) BM_fullReduction_16T/4k [using 16 threads] 7.66ms ± 3% 7.57ms ± 3% -1.24% (p=0.017 n=20+20) BM_fullReduction_16T/10k [using 16 threads] 65.3ms ± 4% 65.0ms ± 3% ~ (p=0.718 n=20+20) --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'unsupported') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 8332a9ae0..af9b58816 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -242,14 +242,26 @@ struct InnerMostDimReducer { } return reducer.finalize(accum); } else { + const typename Self::Index UnrollSize = + (numValuesToReduce / (2*packetSize)) * 2*packetSize; const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; typename Self::PacketReturnType paccum = reducer.template initializePacket(); - for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { + typename Self::PacketReturnType paccum2 = + reducer.template initializePacket(); + for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) { reducer.reducePacket( self.m_impl.template packet(firstIndex + j), &paccum); + reducer.reducePacket( + self.m_impl.template packet(firstIndex + j + packetSize), + &paccum2); + } + for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) { + reducer.reducePacket(self.m_impl.template packet( + firstIndex + j), &paccum); } + reducer.reducePacket(paccum2, &paccum); for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); -- cgit v1.2.3