Diffstat (limited to 'tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc')
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc  74
1 file changed, 71 insertions(+), 3 deletions(-)
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
index 4898448476..3c1c31aa73 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -15,11 +15,80 @@ limitations under the License.
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/platform/stacktrace.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/protobuf/config.pb.h"
namespace tensorflow {
+namespace {
+// The EventMgr has one thread for the polling loop and one to execute
+// event callback functions. Issues for reconsideration:
+// - Is this the right number of threads?
+// - Should EventMgrs be shared between GPUDevices on a multi-GPU machine?
+static const int kNumThreads = 2;
+} // namespace
+
+namespace gpu_event_mgr {
+class ThreadLabel {
+ public:
+ static const char* GetValue() { return value_; }
+
+  // v must point to static storage because value_ retains the pointer and
+  // may use it until it is reset or the thread terminates.
+ static void SetValue(const char* v) { value_ = v; }
+
+ private:
+ static thread_local const char* value_;
+};
+thread_local const char* ThreadLabel::value_ = "";
+
+void WarnIfInCallback(std::function<void()> f) {
+ const char* label = ThreadLabel::GetValue();
+ if (label && !strcmp(label, "gpu_event_mgr")) {
+ if (f) {
+ f();
+ } else {
+ LOG(WARNING) << "Executing inside EventMgr callback thread: "
+ << CurrentStackTrace();
+ }
+ }
+}
+
+void InitThreadpoolLabels(thread::ThreadPool* threadpool) {
+ static const char* label = "gpu_event_mgr";
+ mutex mu;
+ int init_count = 0;
+ condition_variable all_initialized;
+ int exit_count = 0;
+ condition_variable ready_to_exit;
+ const int num_threads = threadpool->NumThreads();
+ for (int i = 0; i < num_threads; ++i) {
+ threadpool->Schedule([num_threads, &mu, &init_count, &all_initialized,
+ &exit_count, &ready_to_exit]() {
+ gpu_event_mgr::ThreadLabel::SetValue(label);
+ mutex_lock l(mu);
+ ++init_count;
+ if (init_count == num_threads) {
+ all_initialized.notify_all();
+ }
+ while (init_count < num_threads) {
+ all_initialized.wait(l);
+ }
+ if (++exit_count == num_threads) {
+ ready_to_exit.notify_all();
+ }
+ });
+ }
+ {
+ mutex_lock l(mu);
+ while (exit_count < num_threads) {
+ ready_to_exit.wait(l);
+ }
+ }
+}
+} // namespace gpu_event_mgr
+
EventMgr::EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options)
: exec_(se),
deferred_bytes_threshold_(gpu_options.deferred_deletion_bytes()
@@ -31,9 +100,8 @@ EventMgr::EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options)
accumulated_stream_(nullptr),
accumulated_tensors_(new TensorReferenceVector),
accumulated_tensor_bytes_(0),
- // threadpool_ has 1 thread for the polling loop, and one to execute
- // event callback functions. Maybe we should have more?
- threadpool_(Env::Default(), "GPU_Event_Manager", 2) {
+ threadpool_(Env::Default(), "GPU_Event_Manager", kNumThreads) {
+ gpu_event_mgr::InitThreadpoolLabels(&threadpool_);
StartPollingLoop();
}