path: root/unsupported/Eigen/CXX11/src/ThreadPool
author    Eugene Zhulenev <ezhulenev@google.com>    2019-02-13 10:20:53 -0800
committer Eugene Zhulenev <ezhulenev@google.com>    2019-02-13 10:20:53 -0800
commit    8c2f30c7900b8e91df6d044431f4b5cc667993c3 (patch)
tree      97d9103f494ea931d665040f5791567ac4cba2fd /unsupported/Eigen/CXX11/src/ThreadPool
parent    bdcb5f33043c559c7100f8fd5eb55fbbd0cdfc69 (diff)
Speedup Tensor ThreadPool RunQueue::Empty()
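Empty() sits on the hot path of the thread pool's worker loops: an idle worker calls it on every spin iteration before deciding whether to block, and it must never report a non-empty queue as empty, since a false "empty" would park a thread while work is still queued. The hypothetical, simplified worker loop below (not Eigen's actual scheduler code; the bool-returning PopFront and the mutex/condition-variable parking are assumptions for illustration) shows where the call lands:

#include <condition_variable>
#include <mutex>

// Hypothetical worker loop: pop-and-run until the local queue looks empty,
// then re-check Empty() under the lock before blocking. Empty() therefore
// runs on every idle iteration, which is why making it cheap matters.
template <typename Queue, typename Task>
void WorkerLoop(Queue& q, std::mutex& mu, std::condition_variable& cv,
                bool& shutdown) {
  for (;;) {
    Task t;
    if (q.PopFront(&t)) {  // assumed API: returns false when nothing was taken
      t();
      continue;
    }
    std::unique_lock<std::mutex> lock(mu);
    if (!q.Empty()) continue;  // the call this commit speeds up
    if (shutdown) return;
    cv.wait(lock);
  }
}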
Diffstat (limited to 'unsupported/Eigen/CXX11/src/ThreadPool')
-rw-r--r--  unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h  42
1 file changed, 28 insertions(+), 14 deletions(-)
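The change below splits Size() into a templated SizeOrNotEmpty<NeedSizeEstimate>() so that Empty() can skip the subtraction, overflow fix, and clamp and instead test whether the masked front and back indices coincide, which happens exactly when the queue is empty. A standalone sketch of the two paths (not the Eigen source; kSizeBits and the derived constants are assumptions shaped like RunQueue's) contrasts the full estimate kept for Size() with the XOR-and-mask test now used by Empty():

#include <cstdio>

// Assumed layout: kSize slots, with front/back counters compared through the
// low kSizeBits + 1 bits selected by kMask2.
constexpr unsigned kSizeBits = 10;
constexpr unsigned kSize = 1u << kSizeBits;
constexpr unsigned kMask2 = (kSize << 1) - 1;

// Full estimate: subtraction, wrap-around fix, clamp (the Size() path).
int SizeEstimate(unsigned front, unsigned back) {
  int size = static_cast<int>(front & kMask2) - static_cast<int>(back & kMask2);
  if (size < 0) size += 2 * static_cast<int>(kSize);  // fix wrap-around
  if (size > static_cast<int>(kSize)) size = kSize;   // clamp transient over-count
  return size;
}

// Cheap test: the masked indices agree iff the queue is empty, so one XOR
// replaces the arithmetic above (the Empty() path).
bool IsEmpty(unsigned front, unsigned back) {
  return ((front ^ back) & kMask2) == 0;
}

int main() {
  std::printf("equal indices: size=%d empty=%d\n", SizeEstimate(4, 4), (int)IsEmpty(4, 4));
  std::printf("one element:   size=%d empty=%d\n", SizeEstimate(5, 4), (int)IsEmpty(5, 4));
  return 0;
}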
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index 73928c1d4..993d9e066 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -148,32 +148,46 @@ class RunQueue {
return n;
}
- // Size returns current queue size.
+ // Size returns current queue size; if NeedSizeEstimate is false, only whether
+ // the size is 0 is guaranteed to be correct.
// Can be called by any thread at any time.
- unsigned Size() const {
+ template <bool NeedSizeEstimate>
+ unsigned SizeOrNotEmpty() const {
// Emptiness plays critical role in thread pool blocking. So we go to great
// effort to not produce false positives (claim non-empty queue as empty).
+ unsigned front = front_.load(std::memory_order_acquire);
for (;;) {
// Capture a consistent snapshot of front/tail.
- unsigned front = front_.load(std::memory_order_acquire);
unsigned back = back_.load(std::memory_order_acquire);
unsigned front1 = front_.load(std::memory_order_relaxed);
- if (front != front1) continue;
- int size = (front & kMask2) - (back & kMask2);
- // Fix overflow.
- if (size < 0) size += 2 * kSize;
- // Order of modification in push/pop is crafted to make the queue look
- // larger than it is during concurrent modifications. E.g. pop can
- // decrement size before the corresponding push has incremented it.
- // So the computed size can be up to kSize + 1, fix it.
- if (size > static_cast<int>(kSize)) size = kSize;
- return size;
+ if (front != front1) {
+ front = front1;
+ std::atomic_thread_fence(std::memory_order_acquire);
+ continue;
+ }
+ if (NeedSizeEstimate) {
+ int size = (front & kMask2) - (back & kMask2);
+ // Fix overflow.
+ if (size < 0) size += 2 * kSize;
+ // Order of modification in push/pop is crafted to make the queue look
+ // larger than it is during concurrent modifications. E.g. pop can
+ // decrement size before the corresponding push has incremented it.
+ // So the computed size can be up to kSize + 1, fix it.
+ if (size > kSize) size = kSize;
+ return size;
+ } else {
+ return ((front ^ back) & kMask2);
+ }
}
}
+ // Size returns current queue size.
+ // Can be called by any thread at any time.
+ unsigned Size() const { return SizeOrNotEmpty<true>(); }
+
// Empty tests whether container is empty.
// Can be called by any thread at any time.
- bool Empty() const { return Size() == 0; }
+ bool Empty() const { return SizeOrNotEmpty<false>() == 0; }
// Delete all the elements from the queue.
void Flush() {