Eliminate gpu_allocator_retry_test flakiness by no longer relying on real race
conditions for testing, but instead forcing parallel threads to strictly
alternate so that the interesting 'race' conditions deterministically arise.

Change: 124560962
author: A. Unique TensorFlower <nobody@tensorflow.org> 2016-06-10 06:59:30 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2016-06-10 08:03:51 -0700
commit: 35f66e516a02aa46ae0380d66c5a17e3f42b9504 (patch)
tree: b38e70b8aead32e7d16b7c1cc084b3b69bf9250c /tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
parent: b5012c50a3373d5388dc7619cc4b181fb2f4fa57 (diff)
1 files changed, 66 insertions, 9 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
index 777f578277..2148f83fe5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
@@ -62,11 +62,66 @@ class FakeAllocator {
   int millis_to_wait_;
 };
 
+// GPUAllocatorRetry is a mechanism to deal with race conditions which
+// are inevitable in the TensorFlow runtime where parallel Nodes can
+// execute in any order.  Properly testing this feature would use real
+// multi-threaded race conditions, but that leads to flaky tests as
+// the expected outcome fails to occur with low but non-zero
+// probability.  To make these tests reliable we simulate real race
+// conditions by forcing parallel threads to take turns in the
+// interesting part of their interaction with the allocator.  This
+// class is the mechanism that imposes turn taking.
+class AlternatingBarrier {
+ public:
+  explicit AlternatingBarrier(int num_users)
+      : num_users_(num_users), next_turn_(0), done_(num_users, false) {}
+
+  void WaitTurn(int user_index) {
+    mutex_lock l(mu_);
+    int wait_cycles = 0;
+    // A user is allowed to proceed out of turn if it waits too long.
+    while (next_turn_ != user_index && wait_cycles++ < 10) {
+      cv_.wait_for(l, std::chrono::milliseconds(1));
+    }
+    if (next_turn_ == user_index) {
+      IncrementTurn();
+      cv_.notify_all();
+    }
+  }
+
+  // When a user quits, stop reserving it a turn.
+  void Done(int user_index) {
+    mutex_lock l(mu_);
+    done_[user_index] = true;
+    if (next_turn_ == user_index) {
+      IncrementTurn();
+      cv_.notify_all();
+    }
+  }
+
+ private:
+  void IncrementTurn() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    int skipped = 0;
+    while (skipped < num_users_) {
+      next_turn_ = (next_turn_ + 1) % num_users_;
+      if (!done_[next_turn_]) return;
+      ++skipped;
+    }
+  }
+
+  mutex mu_;
+  condition_variable cv_;
+  int num_users_;
+  int next_turn_ GUARDED_BY(mu_);
+  std::vector<bool> done_ GUARDED_BY(mu_);
+};
+
 class GPUAllocatorRetryTest : public ::testing::Test {
  protected:
   GPUAllocatorRetryTest() {}
 
   void LaunchConsumerThreads(int num_consumers, int cap_needed) {
+    barrier_.reset(new AlternatingBarrier(num_consumers));
     consumer_count_.resize(num_consumers, 0);
     for (int i = 0; i < num_consumers; ++i) {
       consumers_.push_back(Env::Default()->StartThread(
@@ -74,21 +129,22 @@ class GPUAllocatorRetryTest : public ::testing::Test {
             do {
               void* ptr = nullptr;
               for (int j = 0; j < cap_needed; ++j) {
+                barrier_->WaitTurn(i);
                 ptr = alloc_->AllocateRaw(16, 1);
                 if (ptr == nullptr) {
                   mutex_lock l(mu_);
                   has_failed_ = true;
+                  barrier_->Done(i);
                   return;
                 }
               }
-              // Failures are more likely to occur if each consumer
-              // delays for a while before returning the memory.
-              Env::Default()->SleepForMicroseconds(500);
               ++consumer_count_[i];
               for (int j = 0; j < cap_needed; ++j) {
+                barrier_->WaitTurn(i);
                 alloc_->DeallocateRaw(ptr);
               }
             } while (!notifier_.HasBeenNotified());
+            barrier_->Done(i);
           }));
     }
   }
@@ -113,6 +169,7 @@ class GPUAllocatorRetryTest : public ::testing::Test {
   }
 
   std::unique_ptr<FakeAllocator> alloc_;
+  std::unique_ptr<AlternatingBarrier> barrier_;
   std::vector<Thread*> consumers_;
   std::vector<int> consumer_count_;
   Notification notifier_;
@@ -124,9 +181,9 @@ class GPUAllocatorRetryTest : public ::testing::Test {
 // Verifies correct retrying when memory is slightly overcommitted but
 // we allow retry.
 TEST_F(GPUAllocatorRetryTest, RetrySuccess) {
-  // Support up to 2 allocations simultaneously, waits up to 10 msec for
+  // Support up to 2 allocations simultaneously, waits up to 1000 msec for
   // a chance to alloc.
-  alloc_.reset(new FakeAllocator(2, 10000));
+  alloc_.reset(new FakeAllocator(2, 1000));
   // Launch 3 consumers, each of whom needs 1 unit at a time.
   LaunchConsumerThreads(3, 1);
   // This should be enough time for each consumer to be satisfied many times.
@@ -170,16 +227,16 @@ TEST_F(GPUAllocatorRetryTest, NoRetryFail) {
 // Verifies OutOfMemory failure when retry is allowed but memory capacity
 // is too low even for retry.
 TEST_F(GPUAllocatorRetryTest, RetryInsufficientFail) {
-  // Support up to 2 allocations simultaneously, waits up to 10 msec for
+  // Support up to 2 allocations simultaneously, waits up to 1000 msec for
   // a chance to alloc.
-  alloc_.reset(new FakeAllocator(2, 10000));
+  alloc_.reset(new FakeAllocator(2, 1000));
   // Launch 3 consumers, each of whom needs 2 units at a time.  We expect
   // deadlock where 2 consumers each hold 1 unit, and timeout trying to
   // get the second.
   LaunchConsumerThreads(3, 2);
   Env::Default()->SleepForMicroseconds(50000);
-  // Will wait up to 10 seconds for proper race condition to occur, resulting
-  // in failure.
+  // We're forcing a race condition, so this will fail quickly, but
+  // give it 10 seconds anyway.
   JoinConsumerThreads(true, 10000000);
   for (int i = 0; i < 3; ++i) {
     LOG(INFO) << "Consumer " << i << " is " << consumer_count_[i];
author	A. Unique TensorFlower <nobody@tensorflow.org>	2016-06-10 06:59:30 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2016-06-10 08:03:51 -0700
commit	35f66e516a02aa46ae0380d66c5a17e3f42b9504 (patch)
tree	b38e70b8aead32e7d16b7c1cc084b3b69bf9250c /tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
parent	b5012c50a3373d5388dc7619cc4b181fb2f4fa57 (diff)