author     Derek Murray <mrry@google.com>                   2018-09-24 14:34:57 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>  2018-09-24 14:39:31 -0700
commit     aab3c53e1484404a70565324d1231c4e6ead7425 (patch)
tree       ec04217c08795c7a30ee5bf7af86d7f2b81ec53c /tensorflow/core/common_runtime
parent     567a7196494a79988337bcb726c0c5d453298c99 (diff)
Inline kernel tracing logic into `ExecutorState::Process()`.
All devices implement the same tracing logic in an override of
`Device::Compute()`. However, that logic does not have access to the cached
`NodeItem::kernel_is_expensive` bit for the kernel, so it must make a virtual
call to `OpKernel::IsExpensive()`. By inlining the logic into
`ExecutorState::Process()`, we avoid making an unnecessary virtual call on
each kernel invocation (when a trace controller is attached).

PiperOrigin-RevId: 214332492
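To make the saved cost concrete, the following is a minimal, self-contained C++ sketch of the difference between asking the kernel via the virtual `OpKernel::IsExpensive()` on every traced invocation and reading a bit cached alongside the kernel, as `NodeItem::kernel_is_expensive` does. The `OpKernelLike` and `NodeItemLike` types below are illustrative stand-ins, not the real TensorFlow classes.

    #include <cstdio>

    // Stand-in for tensorflow::OpKernel, whose IsExpensive() is virtual.
    class OpKernelLike {
     public:
      virtual ~OpKernelLike() = default;
      virtual bool IsExpensive() const { return true; }
    };

    // Stand-in for the executor's per-node cache (cf. NodeItem).
    struct NodeItemLike {
      const OpKernelLike* kernel;
      bool kernel_is_expensive;  // computed once when the executor is built
    };

    int main() {
      OpKernelLike kernel;
      NodeItemLike item{&kernel, kernel.IsExpensive()};  // cache the bit once

      // Old per-invocation path: virtual dispatch every time tracing asks.
      bool via_virtual_call = item.kernel->IsExpensive();

      // New per-invocation path: a plain load of the cached bit.
      bool via_cached_bit = item.kernel_is_expensive;

      std::printf("%d %d\n", via_virtual_call, via_cached_bit);
      return 0;
    }

The diff below moves the tracing decision into `ExecutorState::Process()`, which already has the cached bit at hand, so the virtual call disappears from the per-kernel path when a trace controller is attached.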
Diffstat (limited to 'tensorflow/core/common_runtime')
-rw-r--r--  tensorflow/core/common_runtime/device.h            6
-rw-r--r--  tensorflow/core/common_runtime/executor.cc        62
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.cc   5
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.h    5
-rw-r--r--  tensorflow/core/common_runtime/local_device.cc     2
-rw-r--r--  tensorflow/core/common_runtime/local_device.h      3
-rw-r--r--  tensorflow/core/common_runtime/tracing_device.h   60
7 files changed, 74 insertions(+), 69 deletions(-)
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index fb76d6ac29..2ef1547cd9 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -101,6 +101,12 @@ class Device : public DeviceBase {
}
}
+ // If true, and tracing is enabled, the `tracing::ScopedAnnotation()` tracing
+ // mechanism will be used instead of `tracing::ScopedActivity()`. Some devices
+ // may override this method to use annotations, which enable child activities
+ // (such as GPU kernel launches) to be related to the OpKernel invocation.
+ virtual bool TraceUsingAnnotations() const { return false; }
+
// Blocks until all operations queued on the device at the time of
// the call have completed. Returns any error pending on the device
// at completion.
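For orientation, a device opts in to annotation-based tracing by overriding the new hook. The sketch below uses hypothetical `DeviceLike`, `CpuLikeDevice`, and `GpuLikeDevice` classes rather than the real TensorFlow hierarchy; the actual override added by this change is the one in `BaseGPUDevice` further down.

    // Stand-in for the hook added above; not the real tensorflow::Device.
    class DeviceLike {
     public:
      virtual ~DeviceLike() = default;
      // Default mirrors Device::TraceUsingAnnotations(): use ScopedActivity.
      virtual bool TraceUsingAnnotations() const { return false; }
    };

    // CPU-style devices keep the default activity-based tracing.
    class CpuLikeDevice : public DeviceLike {};

    // GPU-style devices opt into annotations so that child activities
    // (asynchronously launched kernels) can be related to the OpKernel.
    class GpuLikeDevice : public DeviceLike {
     public:
      bool TraceUsingAnnotations() const override { return true; }
    };

The executor reads this flag once per step, into `trace_using_annotations_`, rather than querying the device for every kernel (see executor.cc below).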
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 7cef34ac52..2c48084cab 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1238,6 +1238,9 @@ class ExecutorState {
// Step-local container.
ScopedStepContainer* step_container_;
StepStatsCollectorInterface* const stats_collector_;
+ const tracing::TraceCollector* const trace_collector_;
+ const tracing::EventCollector* const event_collector_;
+
// QUESTION: Make it a checkpoint::TensorSliceReaderCacheWrapper
// instead of a pointer? (avoids having to delete).
checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache_;
@@ -1246,6 +1249,7 @@ class ExecutorState {
CancellationManager* cancellation_manager_;
Executor::Args::Runner runner_;
bool sync_on_finish_;
+ const bool trace_using_annotations_;
// Owned.
@@ -1360,12 +1364,16 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl)
tensor_store_(args.tensor_store),
step_container_(args.step_container),
stats_collector_(args.stats_collector),
+ trace_collector_(tracing::GetTraceCollector()),
+ event_collector_(
+ tracing::GetEventCollector(tracing::EventCategory::kCompute)),
slice_reader_cache_(new checkpoint::TensorSliceReaderCacheWrapper),
call_frame_(args.call_frame),
impl_(impl),
cancellation_manager_(args.cancellation_manager),
runner_(args.runner),
sync_on_finish_(args.sync_on_finish),
+ trace_using_annotations_(impl->params_.device->TraceUsingAnnotations()),
num_outstanding_ops_(0) {
// We start the entire execution in iteration 0 of the root frame
// so let us create the root frame and the state for iteration 0.
@@ -1551,6 +1559,32 @@ struct ExecutorState::AsyncState {
}
};
+// Returns true if `item` might be traced by the given trace and event
+// collectors. Returns false only if `item` definitely will not be traced.
+bool MightTrace(const NodeItem& item,
+ const tracing::TraceCollector* trace_collector,
+ const tracing::EventCollector* event_collector,
+ bool using_annotations) {
+ // Tracing will only be enabled if either `event_collector` is non-null,
+ // or `trace_collector` is non-null and enabled for this particular kernel.
+ // Although `tracing::ScopedActivity`, `tracing::ScopedAnnotation`, and
+ // `tracing::ScopedRegion` check subsets of these properties internally in
+ // their constructors, the cost of passing the necessary arguments to them
+ // can be significant, so we avoid constructing them in the common case
+ // (when we know they will not be used).
+ if (event_collector != nullptr) {
+ return true;
+ }
+ if (trace_collector) {
+ if (using_annotations) {
+ return trace_collector->IsEnabledForAnnotations();
+ } else {
+ return trace_collector->IsEnabledForActivities(item.kernel_is_expensive);
+ }
+ }
+ return false;
+}
+
void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
const GraphView& gview = impl_->gview_;
TaggedNodeSeq ready;
@@ -1585,6 +1619,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
Status s;
NodeExecStatsInterface* stats = nullptr;
+
EntryVector outputs;
bool completed = false;
inline_ready.push_back(tagged_node);
@@ -1721,7 +1756,32 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
// Synchronous computes.
OpKernelContext ctx(&params, item.num_outputs);
nodestats::SetOpStart(stats);
- device->Compute(CHECK_NOTNULL(op_kernel), &ctx);
+
+ if (TF_PREDICT_FALSE(MightTrace(item, trace_collector_,
+ event_collector_,
+ trace_using_annotations_))) {
+ const string& op_name = op_kernel->name();
+ tracing::ScopedRegion region(tracing::EventCategory::kCompute,
+ op_name);
+ if (trace_using_annotations_) {
+ // The OpKernel may create child activities (such as GPU kernel
+ // launches), so use a `ScopedAnnotation` to relate these activities
+ // in the trace.
+ tracing::ScopedAnnotation activity(op_name,
+ op_kernel->type_string());
+ device->Compute(op_kernel, &ctx);
+ } else {
+ // Use the cheaper `ScopedActivity` to trace just the OpKernel
+ // execution.
+ tracing::ScopedActivity activity(op_name, op_kernel->type_string(),
+ item.kernel_is_expensive);
+ device->Compute(op_kernel, &ctx);
+ }
+ } else {
+ // In the common case, avoid creating any tracing objects.
+ device->Compute(op_kernel, &ctx);
+ }
+
nodestats::SetOpEnd(stats);
s = ProcessOutputs(item, &ctx, &outputs, stats);
if (s.ok() && impl_->device_record_tensor_accesses_) {
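Taken together, the executor changes follow a fetch-once, check-per-kernel shape: the collectors and the annotation flag are cached when the `ExecutorState` is constructed, and the per-kernel path only performs pointer and bool tests before deciding whether to construct any scoped tracing objects. The following self-contained sketch shows that shape with hypothetical stand-in types (`CollectorLike`, `ExecutorStateLike`), not the real tracing API.

    // Hypothetical stand-ins; not the real TensorFlow tracing API.
    struct CollectorLike {
      bool annotations_enabled = false;
      bool activities_enabled = false;
    };

    // Pretend registry lookup: done once per executor state, not per kernel.
    const CollectorLike* GetCollectorOnce() {
      static CollectorLike collector;
      return &collector;
    }

    class ExecutorStateLike {
     public:
      explicit ExecutorStateLike(bool device_uses_annotations)
          : collector_(GetCollectorOnce()),  // cached once in the constructor
            trace_using_annotations_(device_uses_annotations) {}

      void ProcessKernel() {
        // Fast path: cheap tests only, no scoped tracing objects constructed.
        const bool might_trace =
            collector_ != nullptr &&
            (trace_using_annotations_ ? collector_->annotations_enabled
                                      : collector_->activities_enabled);
        if (might_trace) {
          // Rare path: construct scoped tracing objects around the kernel.
        } else {
          // Common path: run the kernel with no tracing objects at all.
        }
      }

     private:
      const CollectorLike* const collector_;
      const bool trace_using_annotations_;
    };

    int main() {
      ExecutorStateLike state(/*device_uses_annotations=*/true);
      state.ProcessKernel();
      return 0;
    }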
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index cf3faf68ff..d8ebdeff5d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -434,9 +434,6 @@ Status BaseGPUDevice::FillContextMap(const Graph* graph,
}
void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
- tracing::ScopedRegion region(tracing::EventCategory::kCompute,
- op_kernel->name());
-
// NOTE(tucker): We need to discriminate between Eigen GPU
// operations and all others. If an operation is Eigen
// implemented (or otherwise tries to launch a cuda kernel
@@ -450,8 +447,6 @@ void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
context->SetStatus(errors::Internal(
"Invalid synchronous 'Compute' on GPU for '_Recv' op"));
} else {
- tracing::ScopedAnnotation annotation(op_kernel->name(),
- op_kernel->type_string());
ComputeHelper(op_kernel, context);
}
}
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index b25fe8645f..674e8384d5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -65,6 +65,11 @@ class BaseGPUDevice : public LocalDevice {
// completes.
bool RequiresRecordingAccessedTensors() const override;
+ // GPU kernel execution requires us to use `tracing::ScopedAnnotation()`
+ // rather than `tracing::ScopedActivity()`, in order to relate asynchronously
+ // launched GPU kernels to the OpKernel.
+ bool TraceUsingAnnotations() const { return true; }
+
void ConsumeListOfAccessedTensors(
DeviceContext* device_context,
const TensorReferenceVector& tensor_refs) override;
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index db5022d56e..873182371e 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -62,7 +62,7 @@ struct LocalDevice::EigenThreadPoolInfo {
LocalDevice::LocalDevice(const SessionOptions& options,
const DeviceAttributes& attributes)
- : TracingDevice(options.env, attributes), owned_tp_info_(nullptr) {
+ : Device(options.env, attributes), owned_tp_info_(nullptr) {
// Log info messages if TensorFlow is not compiled with instructions that
// could speed up performance and are available on the current CPU.
port::InfoAboutUnusedCPUFeatures();
diff --git a/tensorflow/core/common_runtime/local_device.h b/tensorflow/core/common_runtime/local_device.h
index 9a82fb7204..226f121bf3 100644
--- a/tensorflow/core/common_runtime/local_device.h
+++ b/tensorflow/core/common_runtime/local_device.h
@@ -17,7 +17,6 @@ limitations under the License.
#define TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_DEVICE_H_
#include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/common_runtime/tracing_device.h"
#include "tensorflow/core/framework/device_attributes.pb.h"
#include "tensorflow/core/platform/macros.h"
@@ -32,7 +31,7 @@ struct SessionOptions;
// initializes a shared Eigen compute device used by both. This
// should eventually be removed once we refactor ThreadPoolDevice and
// GPUDevice into more 'process-wide' abstractions.
-class LocalDevice : public TracingDevice {
+class LocalDevice : public Device {
public:
LocalDevice(const SessionOptions& options,
const DeviceAttributes& attributes);
diff --git a/tensorflow/core/common_runtime/tracing_device.h b/tensorflow/core/common_runtime/tracing_device.h
deleted file mode 100644
index e1b163074f..0000000000
--- a/tensorflow/core/common_runtime/tracing_device.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_TRACING_DEVICE_H_
-#define TENSORFLOW_CORE_COMMON_RUNTIME_TRACING_DEVICE_H_
-
-#include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/tracing.h"
-
-namespace tensorflow {
-
-namespace test {
-class Benchmark;
-}
-struct SessionOptions;
-
-// This class implements tracing functionality that is shared by its subclasses
-// (including ThreadPoolDevice and XlaDevice).
-class TracingDevice : public Device {
- public:
- TracingDevice(Env* env, const DeviceAttributes& attributes)
- : Device(env, attributes) {}
-
- void Compute(OpKernel* op_kernel, OpKernelContext* context) override {
- const tracing::TraceCollector* trace_collector =
- tracing::GetTraceCollector();
- if (TF_PREDICT_FALSE(
- (trace_collector &&
- trace_collector->IsEnabled(op_kernel->IsExpensive())) ||
- tracing::GetEventCollector(tracing::EventCategory::kCompute))) {
- const string& op_name = op_kernel->name();
- tracing::ScopedActivity activity(op_name, op_kernel->type_string(),
- op_kernel->IsExpensive());
- tracing::ScopedRegion region(tracing::EventCategory::kCompute, op_name);
- op_kernel->Compute(context);
- } else {
- op_kernel->Compute(context);
- }
- }
-
- private:
- TF_DISALLOW_COPY_AND_ASSIGN(TracingDevice);
-};
-
-} // namespace tensorflow
-
-#endif // TENSORFLOW_CORE_COMMON_RUNTIME_TRACING_DEVICE_H_