aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <nobody@tensorflow.org>2016-06-10 06:59:30 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2016-06-10 08:03:51 -0700
commit35f66e516a02aa46ae0380d66c5a17e3f42b9504 (patch)
treeb38e70b8aead32e7d16b7c1cc084b3b69bf9250c /tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
parentb5012c50a3373d5388dc7619cc4b181fb2f4fa57 (diff)
Eliminate gpu_allocator_retry_test flakiness by no longer relying on real
race conditions for testing, but instead forcing parallel threads to strictly alternate so that the interesting 'race' conditions deterministically arise. Change: 124560962
Diffstat (limited to 'tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc')
-rw-r--r--tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc75
1 file changed, 66 insertions, 9 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
index 777f578277..2148f83fe5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
@@ -62,11 +62,66 @@ class FakeAllocator {
int millis_to_wait_;
};
+// GPUAllocatorRetry is a mechanism to deal with race conditions which
+// are inevitable in the TensorFlow runtime where parallel Nodes can
+// execute in any order. Properly testing this feature would use real
+// multi-threaded race conditions, but that leads to flaky tests as
+// the expected outcome fails to occur with low but non-zero
+// probability. To make these tests reliable we simulate real race
+// conditions by forcing parallel threads to take turns in the
+// interesting part of their interaction with the allocator. This
+// class is the mechanism that imposes turn taking.
+class AlternatingBarrier {
+ public:
+ explicit AlternatingBarrier(int num_users)
+ : num_users_(num_users), next_turn_(0), done_(num_users, false) {}
+
+ void WaitTurn(int user_index) {
+ mutex_lock l(mu_);
+ int wait_cycles = 0;
+ // A user is allowed to proceed out of turn if it waits too long.
+ while (next_turn_ != user_index && wait_cycles++ < 10) {
+ cv_.wait_for(l, std::chrono::milliseconds(1));
+ }
+ if (next_turn_ == user_index) {
+ IncrementTurn();
+ cv_.notify_all();
+ }
+ }
+
+ // When a user quits, stop reserving it a turn.
+ void Done(int user_index) {
+ mutex_lock l(mu_);
+ done_[user_index] = true;
+ if (next_turn_ == user_index) {
+ IncrementTurn();
+ cv_.notify_all();
+ }
+ }
+
+ private:
+ void IncrementTurn() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ int skipped = 0;
+ while (skipped < num_users_) {
+ next_turn_ = (next_turn_ + 1) % num_users_;
+ if (!done_[next_turn_]) return;
+ ++skipped;
+ }
+ }
+
+ mutex mu_;
+ condition_variable cv_;
+ int num_users_;
+ int next_turn_ GUARDED_BY(mu_);
+ std::vector<bool> done_ GUARDED_BY(mu_);
+};
+
class GPUAllocatorRetryTest : public ::testing::Test {
protected:
GPUAllocatorRetryTest() {}
void LaunchConsumerThreads(int num_consumers, int cap_needed) {
+ barrier_.reset(new AlternatingBarrier(num_consumers));
consumer_count_.resize(num_consumers, 0);
for (int i = 0; i < num_consumers; ++i) {
consumers_.push_back(Env::Default()->StartThread(
@@ -74,21 +129,22 @@ class GPUAllocatorRetryTest : public ::testing::Test {
do {
void* ptr = nullptr;
for (int j = 0; j < cap_needed; ++j) {
+ barrier_->WaitTurn(i);
ptr = alloc_->AllocateRaw(16, 1);
if (ptr == nullptr) {
mutex_lock l(mu_);
has_failed_ = true;
+ barrier_->Done(i);
return;
}
}
- // Failures are more likely to occur if each consumer
- // delays for a while before returning the memory.
- Env::Default()->SleepForMicroseconds(500);
++consumer_count_[i];
for (int j = 0; j < cap_needed; ++j) {
+ barrier_->WaitTurn(i);
alloc_->DeallocateRaw(ptr);
}
} while (!notifier_.HasBeenNotified());
+ barrier_->Done(i);
}));
}
}
@@ -113,6 +169,7 @@ class GPUAllocatorRetryTest : public ::testing::Test {
}
std::unique_ptr<FakeAllocator> alloc_;
+ std::unique_ptr<AlternatingBarrier> barrier_;
std::vector<Thread*> consumers_;
std::vector<int> consumer_count_;
Notification notifier_;
@@ -124,9 +181,9 @@ class GPUAllocatorRetryTest : public ::testing::Test {
// Verifies correct retrying when memory is slightly overcommitted but
// we allow retry.
TEST_F(GPUAllocatorRetryTest, RetrySuccess) {
- // Support up to 2 allocations simultaneously, waits up to 10 msec for
+ // Support up to 2 allocations simultaneously, waits up to 1000 msec for
// a chance to alloc.
- alloc_.reset(new FakeAllocator(2, 10000));
+ alloc_.reset(new FakeAllocator(2, 1000));
// Launch 3 consumers, each of whom needs 1 unit at a time.
LaunchConsumerThreads(3, 1);
// This should be enough time for each consumer to be satisfied many times.
@@ -170,16 +227,16 @@ TEST_F(GPUAllocatorRetryTest, NoRetryFail) {
// Verifies OutOfMemory failure when retry is allowed but memory capacity
// is too low even for retry.
TEST_F(GPUAllocatorRetryTest, RetryInsufficientFail) {
- // Support up to 2 allocations simultaneously, waits up to 10 msec for
+ // Support up to 2 allocations simultaneously, waits up to 1000 msec for
// a chance to alloc.
- alloc_.reset(new FakeAllocator(2, 10000));
+ alloc_.reset(new FakeAllocator(2, 1000));
// Launch 3 consumers, each of whom needs 2 units at a time. We expect
// deadlock where 2 consumers each hold 1 unit, and timeout trying to
// get the second.
LaunchConsumerThreads(3, 2);
Env::Default()->SleepForMicroseconds(50000);
- // Will wait up to 10 seconds for proper race condition to occur, resulting
- // in failure.
+ // We're forcing a race condition, so this will fail quickly, but
+ // give it 10 seconds anyway.
JoinConsumerThreads(true, 10000000);
for (int i = 0; i < 3; ++i) {
LOG(INFO) << "Consumer " << i << " is " << consumer_count_[i];