aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/cpp/thread_manager
diff options
context:
space:
mode:
authorGravatar Sree Kuchibhotla <sreek@google.com>2017-08-09 20:40:24 -0700
committerGravatar Sree Kuchibhotla <sreek@google.com>2017-08-09 20:40:24 -0700
commit419b617a95d419af193ff6b9d540ec6e22ed8c1f (patch)
tree4e96cc6ce8b7a3fe157fed95b148de6a8b055f5d /src/cpp/thread_manager
parenta27680b597111bc3ddd1272a21be6d35a296b349 (diff)
Fix thread avalances in thread manager
Diffstat (limited to 'src/cpp/thread_manager')
-rw-r--r--src/cpp/thread_manager/thread_manager.cc38
1 files changed, 33 insertions, 5 deletions
diff --git a/src/cpp/thread_manager/thread_manager.cc b/src/cpp/thread_manager/thread_manager.cc
index 3610aa9013..23264f1b5b 100644
--- a/src/cpp/thread_manager/thread_manager.cc
+++ b/src/cpp/thread_manager/thread_manager.cc
@@ -158,11 +158,39 @@ void ThreadManager::MainWorkLoop() {
}
// If we decided to finish the thread, break out of the while loop
if (done) break;
- // ... otherwise increase poller count and continue
- // There's a chance that we'll exceed the max poller count: that is
- // explicitly ok - we'll decrease after one poll timeout, and prevent
- // some thrashing starting up and shutting down threads
- num_pollers_++;
+
+ // Otherwise go back to polling as long as it doesn't exceed max_pollers_
+ //
+ // **WARNING**:
+ // There is a possibility of threads thrashing here (i.e excessive thread
+ // shutdowns and creations than the ideal case). This happens if max_poller_
+ // count is small and the rate of incoming requests is also small. In such
+ // scenarios we can possibly configure max_pollers_ to a higher value and/or
+ // increase the cq timeout.
+ //
+ // However, not doing this check here and unconditionally incrementing
+ // num_pollers (and hoping that the system will eventually settle down) has
+ // far worse consequences i.e huge number of threads getting created to the
+ // point of thread-exhaustion. For example: if the incoming request rate is
+ // very high, all the polling threads will return very quickly from
+ // PollForWork() with WORK_FOUND. They all briefly decrement num_pollers_
+ // counter thereby possibly - and briefly - making it go below min_pollers;
+ // This will most likely result in the creation of a new poller since
+ // num_pollers_ dipped below min_pollers_.
+ //
+ // Now, If we didn't do the max_poller_ check here, all these threads will
+ // go back to doing PollForWork() and the whole cycle repeats (with a new
+ // thread being added in each cycle). Once the total number of threads in
+ // the system crosses a certain threshold (around ~1500), there is heavy
+ // contention on mutexes (the mu_ here or the mutexes in gRPC core like the
+ // pollset mutex) that makes DoWork() take longer to finish thereby causing
+ // new poller threads to be created even faster. This results in a thread
+ // avalanche.
+ if (num_pollers_ < max_pollers_) {
+ num_pollers_++;
+ } else {
+ break;
+ }
};
CleanupCompletedThreads();