From aab3c53e1484404a70565324d1231c4e6ead7425 Mon Sep 17 00:00:00 2001
From: Derek Murray
Date: Mon, 24 Sep 2018 14:34:57 -0700
Subject: Inline kernel tracing logic into `ExecutorState::Process()`.

All devices implement the same tracing logic in an override of
`Device::Compute()`. However, that logic does not have access to the cached
`NodeItem::kernel_is_expensive` bit for the kernel, so it must make a virtual
call to `OpKernel::IsExpensive()`. By inlining the logic into
`ExecutorState::Process()`, we avoid making an unnecessary virtual call on
each kernel invocation (when a trace controller is attached).

PiperOrigin-RevId: 214332492
---
 tensorflow/core/common_runtime/device.h          |  6 +++
 tensorflow/core/common_runtime/executor.cc       | 62 +++++++++++++++++++++++-
 tensorflow/core/common_runtime/gpu/gpu_device.cc |  5 --
 tensorflow/core/common_runtime/gpu/gpu_device.h  |  5 ++
 tensorflow/core/common_runtime/local_device.cc   |  2 +-
 tensorflow/core/common_runtime/local_device.h    |  3 +-
 tensorflow/core/common_runtime/tracing_device.h  | 60 -----------------------
 7 files changed, 74 insertions(+), 69 deletions(-)
 delete mode 100644 tensorflow/core/common_runtime/tracing_device.h

(limited to 'tensorflow/core/common_runtime')

diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index fb76d6ac29..2ef1547cd9 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -101,6 +101,12 @@ class Device : public DeviceBase {
     }
   }
 
+  // If true, and tracing is enabled, the `tracing::ScopedAnnotation()` tracing
+  // mechanism will be used instead of `tracing::ScopedActivity()`. Some devices
+  // may override this method to use annotations, which enable child activities
+  // (such as GPU kernel launches) to be related to the OpKernel invocation.
+  virtual bool TraceUsingAnnotations() const { return false; }
+
   // Blocks until all operations queued on the device at the time of
   // the call have completed. Returns any error pending on the device
   // at completion.
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 7cef34ac52..2c48084cab 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1238,6 +1238,9 @@ class ExecutorState {
   // Step-local container.
   ScopedStepContainer* step_container_;
   StepStatsCollectorInterface* const stats_collector_;
+  const tracing::TraceCollector* const trace_collector_;
+  const tracing::EventCollector* const event_collector_;
+
   // QUESTION: Make it a checkpoint::TensorSliceReaderCacheWrapper
   // instead of a pointer? (avoids having to delete).
   checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache_;
@@ -1246,6 +1249,7 @@ class ExecutorState {
   CancellationManager* cancellation_manager_;
   Executor::Args::Runner runner_;
   bool sync_on_finish_;
+  const bool trace_using_annotations_;
 
   // Owned.
@@ -1360,12 +1364,16 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl)
       tensor_store_(args.tensor_store),
       step_container_(args.step_container),
       stats_collector_(args.stats_collector),
+      trace_collector_(tracing::GetTraceCollector()),
+      event_collector_(
+          tracing::GetEventCollector(tracing::EventCategory::kCompute)),
       slice_reader_cache_(new checkpoint::TensorSliceReaderCacheWrapper),
       call_frame_(args.call_frame),
       impl_(impl),
       cancellation_manager_(args.cancellation_manager),
       runner_(args.runner),
       sync_on_finish_(args.sync_on_finish),
+      trace_using_annotations_(impl->params_.device->TraceUsingAnnotations()),
       num_outstanding_ops_(0) {
   // We start the entire execution in iteration 0 of the root frame
   // so let us create the root frame and the state for iteration 0.
@@ -1551,6 +1559,32 @@ struct ExecutorState::AsyncState {
   }
 };
 
+// Returns true if `item` might be traced by the given trace and event
+// collectors. Returns false only if `item` definitely will not be traced.
+bool MightTrace(const NodeItem& item,
+                const tracing::TraceCollector* trace_collector,
+                const tracing::EventCollector* event_collector,
+                bool using_annotations) {
+  // Tracing will only be enabled if either `event_collector` is non null,
+  // or `trace_collector` is non-null and enabled for this particular kernel.
+  // Although `tracing::ScopedActivity`,
+  // `tracing::ScopedAnnotation`, and `tracing::ScopedRegion` check subsets of
+  // these properties internally in their constructors, the cost of passing the
+  // necessary arguments to them can be significant, so we avoid constructing
+  // them in the common case (when we know they will not be used).
+  if (event_collector != nullptr) {
+    return true;
+  }
+  if (trace_collector) {
+    if (using_annotations) {
+      return trace_collector->IsEnabledForAnnotations();
+    } else {
+      return trace_collector->IsEnabledForActivities(item.kernel_is_expensive);
+    }
+  }
+  return false;
+}
+
 void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
   const GraphView& gview = impl_->gview_;
   TaggedNodeSeq ready;
@@ -1585,6 +1619,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
 
   Status s;
   NodeExecStatsInterface* stats = nullptr;
+
   EntryVector outputs;
   bool completed = false;
   inline_ready.push_back(tagged_node);
@@ -1721,7 +1756,32 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
         // Synchronous computes.
         OpKernelContext ctx(&params, item.num_outputs);
         nodestats::SetOpStart(stats);
-        device->Compute(CHECK_NOTNULL(op_kernel), &ctx);
+
+        if (TF_PREDICT_FALSE(MightTrace(item, trace_collector_,
+                                        event_collector_,
+                                        trace_using_annotations_))) {
+          const string& op_name = op_kernel->name();
+          tracing::ScopedRegion region(tracing::EventCategory::kCompute,
+                                       op_name);
+          if (trace_using_annotations_) {
+            // The OpKernel may create child activities (such as GPU kernel
+            // launches), so use a `ScopedAnnotation` to relate these activities
+            // in the trace.
+            tracing::ScopedAnnotation activity(op_name,
+                                               op_kernel->type_string());
+            device->Compute(op_kernel, &ctx);
+          } else {
+            // Use the cheaper `ScopedActivity` to trace just the OpKernel
+            // execution.
+            tracing::ScopedActivity activity(op_name, op_kernel->type_string(),
+                                             item.kernel_is_expensive);
+            device->Compute(op_kernel, &ctx);
+          }
+        } else {
+          // In the common case, avoid creating any tracing objects.
+          device->Compute(op_kernel, &ctx);
+        }
+
         nodestats::SetOpEnd(stats);
         s = ProcessOutputs(item, &ctx, &outputs, stats);
         if (s.ok() && impl_->device_record_tensor_accesses_) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index cf3faf68ff..d8ebdeff5d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -434,9 +434,6 @@ Status BaseGPUDevice::FillContextMap(const Graph* graph,
 }
 
 void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
-  tracing::ScopedRegion region(tracing::EventCategory::kCompute,
-                               op_kernel->name());
-
   // NOTE(tucker): We need to discriminate between Eigen GPU
   // operations and all others. If an operation is Eigen
   // implemented (or otherwise tries to launch a cuda kernel
@@ -450,8 +447,6 @@ void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
     context->SetStatus(errors::Internal(
         "Invalid synchronous 'Compute' on GPU for '_Recv' op"));
   } else {
-    tracing::ScopedAnnotation annotation(op_kernel->name(),
-                                         op_kernel->type_string());
     ComputeHelper(op_kernel, context);
   }
 }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index b25fe8645f..674e8384d5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -65,6 +65,11 @@ class BaseGPUDevice : public LocalDevice {
   // completes.
   bool RequiresRecordingAccessedTensors() const override;
 
+  // GPU kernel execution requires us to use `tracing::ScopedAnnotation()`
+  // rather than `tracing::ScopedActivity()`, in order to relate asynchronously
+  // launched GPU kernels to the OpKernel.
+  bool TraceUsingAnnotations() const { return true; }
+
   void ConsumeListOfAccessedTensors(
       DeviceContext* device_context,
       const TensorReferenceVector& tensor_refs) override;
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index db5022d56e..873182371e 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -62,7 +62,7 @@ struct LocalDevice::EigenThreadPoolInfo {
 
 LocalDevice::LocalDevice(const SessionOptions& options,
                          const DeviceAttributes& attributes)
-    : TracingDevice(options.env, attributes), owned_tp_info_(nullptr) {
+    : Device(options.env, attributes), owned_tp_info_(nullptr) {
   // Log info messages if TensorFlow is not compiled with instructions that
   // could speed up performance and are available on the current CPU.
   port::InfoAboutUnusedCPUFeatures();
diff --git a/tensorflow/core/common_runtime/local_device.h b/tensorflow/core/common_runtime/local_device.h
index 9a82fb7204..226f121bf3 100644
--- a/tensorflow/core/common_runtime/local_device.h
+++ b/tensorflow/core/common_runtime/local_device.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_DEVICE_H_
 
 #include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/common_runtime/tracing_device.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/platform/macros.h"
 
@@ -32,7 +31,7 @@ struct SessionOptions;
 // This class is shared by ThreadPoolDevice and GPUDevice and
 // initializes a shared Eigen compute device used by both. This
 // should eventually be removed once we refactor ThreadPoolDevice and
 // GPUDevice into more 'process-wide' abstractions.
-class LocalDevice : public TracingDevice {
+class LocalDevice : public Device {
  public:
   LocalDevice(const SessionOptions& options,
               const DeviceAttributes& attributes);
diff --git a/tensorflow/core/common_runtime/tracing_device.h b/tensorflow/core/common_runtime/tracing_device.h
deleted file mode 100644
index e1b163074f..0000000000
--- a/tensorflow/core/common_runtime/tracing_device.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_TRACING_DEVICE_H_
-#define TENSORFLOW_CORE_COMMON_RUNTIME_TRACING_DEVICE_H_
-
-#include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/tracing.h"
-
-namespace tensorflow {
-
-namespace test {
-class Benchmark;
-}
-struct SessionOptions;
-
-// This class implements tracing functionality that is shared by its subclasses
-// (including ThreadPoolDevice and XlaDevice).
-class TracingDevice : public Device {
- public:
-  TracingDevice(Env* env, const DeviceAttributes& attributes)
-      : Device(env, attributes) {}
-
-  void Compute(OpKernel* op_kernel, OpKernelContext* context) override {
-    const tracing::TraceCollector* trace_collector =
-        tracing::GetTraceCollector();
-    if (TF_PREDICT_FALSE(
-            (trace_collector &&
-             trace_collector->IsEnabled(op_kernel->IsExpensive())) ||
-            tracing::GetEventCollector(tracing::EventCategory::kCompute))) {
-      const string& op_name = op_kernel->name();
-      tracing::ScopedActivity activity(op_name, op_kernel->type_string(),
-                                       op_kernel->IsExpensive());
-      tracing::ScopedRegion region(tracing::EventCategory::kCompute, op_name);
-      op_kernel->Compute(context);
-    } else {
-      op_kernel->Compute(context);
-    }
-  }
-
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(TracingDevice);
-};
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_TRACING_DEVICE_H_
-- 
cgit v1.2.3
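
Note on the optimization (not part of the patch): the change amounts to reading a bit
cached at executor-construction time instead of making a virtual call on every kernel
dispatch. The standalone C++ sketch below illustrates that pattern outside of
TensorFlow; FakeKernel, FakeNodeItem, and MightTraceCached are simplified, hypothetical
stand-ins for OpKernel, NodeItem, and MightTrace, not actual TensorFlow types.

    // Standalone illustration of caching a virtual-call result in a plain bit.
    #include <cstdio>

    struct FakeKernel {
      virtual ~FakeKernel() = default;
      // Stand-in for the virtual OpKernel::IsExpensive() call.
      virtual bool IsExpensive() const { return true; }
    };

    struct FakeNodeItem {
      const FakeKernel* kernel = nullptr;
      // Cached copy of kernel->IsExpensive(), filled in once up front, so the
      // per-invocation hot path does not need a virtual call.
      bool kernel_is_expensive = false;
    };

    // Analogue of MightTrace(): consult only cached state on the hot path.
    inline bool MightTraceCached(const FakeNodeItem& item, bool tracing_enabled) {
      return tracing_enabled && item.kernel_is_expensive;
    }

    int main() {
      FakeKernel kernel;
      FakeNodeItem item;
      item.kernel = &kernel;
      item.kernel_is_expensive = kernel.IsExpensive();  // one virtual call, up front
      std::printf("might trace: %s\n",
                  MightTraceCached(item, /*tracing_enabled=*/true) ? "yes" : "no");
      return 0;
    }

In the patch itself, NodeItem::kernel_is_expensive plays the role of the cached bit,
and MightTrace() consults it together with the cached trace_collector_ and
event_collector_ pointers before any tracing object is constructed.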