aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported
diff options
context:
space:
mode:
authorGravatar Ilya Tokar <tokarip@google.com>2020-02-14 16:02:57 -0500
committerGravatar Ilya Tokar <tokarip@google.com>2020-02-14 16:02:57 -0500
commiteb6cc29583f0c2ab30f612dd0bdcfad163a569b0 (patch)
treeb8e75877a0e5489838477bea1fff0bd647ae0e58 /unsupported
parent776960024585b907acc4abc3c59aef605941bb75 (diff)
Avoid a division in NonBlockingThreadPool::Steal.
Looking at profiles we spend ~10-20% of Steal on simply computing random % size. We can reduce random 32-bit int into [0, size) range with a single multiplication and shift. This transformation is described in https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
Diffstat (limited to 'unsupported')
-rw-r--r--unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h8
1 files changed, 6 insertions, 2 deletions
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 9353f41e2..43a274651 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -335,8 +335,12 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
PerThread* pt = GetPerThread();
const size_t size = limit - start;
unsigned r = Rand(&pt->rand);
- unsigned victim = r % size;
- unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
+ // Reduce r into [0, size) range, this utilizes trick from
+ // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+ eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
+ unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
+ unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
+ unsigned inc = all_coprimes_[size - 1][index];
for (unsigned i = 0; i < size; i++) {
eigen_plain_assert(start + victim < limit);