From eb6cc29583f0c2ab30f612dd0bdcfad163a569b0 Mon Sep 17 00:00:00 2001 From: Ilya Tokar Date: Fri, 14 Feb 2020 16:02:57 -0500 Subject: Avoid a division in NonBlockingThreadPool::Steal. Looking at profiles we spend ~10-20% of Steal on simply computing random % size. We can reduce random 32-bit int into [0, size) range with a single multiplication and shift. This transformation is described in https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ --- unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'unsupported') diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 9353f41e2..43a274651 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -335,8 +335,12 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface { PerThread* pt = GetPerThread(); const size_t size = limit - start; unsigned r = Rand(&pt->rand); - unsigned victim = r % size; - unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()]; + // Reduce r into [0, size) range, this utilizes trick from + // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ + eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30)); + unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32; + unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32; + unsigned inc = all_coprimes_[size - 1][index]; for (unsigned i = 0; i < size; i++) { eigen_plain_assert(start + victim < limit); -- cgit v1.2.3