diff options
author | Shanqing Cai <cais@google.com> | 2017-04-29 08:26:06 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2017-04-29 09:48:44 -0700 |
commit | a25509eda3e42dddc88155965eaffe4b3c455af5 (patch) | |
tree | 79bc1880238cd291c64123c2c241122da1c000b5 /tensorflow/core/common_runtime/direct_session.cc | |
parent | ea25bc496e30ecc1f90622906390669913d92e74 (diff) |
Add TFDBG support to GrpcSession
* Along the way, unify the way the debugger works in DirectSession (non-distributed Sessions) and MasterSession (for distributed Sessions).
* The SummarizeDebugTensorWatches method is invoked in the DirectSession::GetOrCreateExecutors() and MasterSession::HashBuildGraphOptions() methods to generate keys for partition graphs and executors.
* The DebuggerStateInterface::PublishDebugMetadata() method is used to send metadata about the debugged Session::Run() call to debug URLs. This happens in DirectSession::Run() and MasterSession::DoRunWithLocalExecution(), respectively.
* The DebugGraphDecoratorInterface::DecorateGraph() and DebugGraphDecoratorInterface::PublishGraph() methods are used to insert debug ops to the debugged graph and send the modified graph to debug URLs. This happens in DirectSession::GetOrCreateExecutors() and GraphMgr::InitItem(), respectively.
Change: 154631802
Diffstat (limited to 'tensorflow/core/common_runtime/direct_session.cc')
-rw-r--r-- | tensorflow/core/common_runtime/direct_session.cc | 79 |
1 files changed, 58 insertions, 21 deletions
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index f208e4b78e..7c017f9584 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -370,6 +370,43 @@ Status DirectSession::Run(const NamedTensorList& inputs, &run_metadata); } +Status DirectSession::CreateDebuggerState( + const DebugOptions& debug_options, int64 session_run_count, + int64 executor_step_count, const std::vector<string>& input_names, + const std::vector<string>& output_names, + const std::vector<string>& target_names, + std::unique_ptr<DebuggerStateInterface>* debugger_state) { + std::unique_ptr<DebuggerStateInterface> state = + DebuggerStateRegistry::CreateState(debug_options); + if (!state) { + return errors::Internal( + "Debugger options are set, but creation of debugger state failed. " + "It appears that debugger is not linked in this TensorFlow build."); + } + + TF_RETURN_IF_ERROR(state->PublishDebugMetadata( + debug_options.global_step(), session_run_count, executor_step_count, + input_names, output_names, target_names)); + + *debugger_state = std::move(state); + return Status::OK(); +} + +Status DirectSession::DecorateAndPublishGraphForDebug( + const DebugOptions& debug_options, Graph* graph, Device* device) { + std::unique_ptr<DebugGraphDecoratorInterface> decorator = + DebugGraphDecoratorRegistry::CreateDecorator(debug_options); + if (!decorator) { + return errors::Internal( + "Debugger options are set, but creation of debug graph publisher ", + "failed."); + } + + TF_RETURN_IF_ERROR(decorator->DecorateGraph(graph, device)); + TF_RETURN_IF_ERROR(decorator->PublishGraph(*graph)); + return Status::OK(); +} + Status DirectSession::Run(const RunOptions& run_options, const NamedTensorList& inputs, const std::vector<string>& output_names, @@ -402,27 +439,21 @@ Status DirectSession::Run(const RunOptions& run_options, // Check if we already have an executor for these 
arguments. ExecutorsAndKeys* executors_and_keys; - RunStateArgs run_state_args; + RunStateArgs run_state_args(run_options.debug_options()); Executor::Args args; args.step_id = step_id_counter_.fetch_add(1); - // EXPERIMENTAL: Options that allow the client to insert nodes into partition - // graphs for debugging. - if (!run_options.debug_options().debug_tensor_watch_opts().empty()) { - run_state_args.debugger_state = - DebuggerStateRegistry::CreateState(run_options.debug_options()); - } - TF_RETURN_IF_ERROR( GetOrCreateExecutors(pool, input_tensor_names, output_names, target_nodes, &executors_and_keys, &run_state_args)); const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1); - if (run_state_args.debugger_state) { - TF_RETURN_IF_ERROR(run_state_args.debugger_state->PublishDebugMetadata( - run_options.debug_options().global_step(), args.step_id, - executor_step_count, input_tensor_names, output_names, target_nodes)); + std::unique_ptr<DebuggerStateInterface> debugger_state; + if (!run_options.debug_options().debug_tensor_watch_opts().empty()) { + TF_RETURN_IF_ERROR(CreateDebuggerState( + run_options.debug_options(), args.step_id, executor_step_count, + input_tensor_names, output_names, target_nodes, &debugger_state)); } // Configure a call frame for the step, which we use to feed and @@ -629,7 +660,9 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names, // Check if we already have an executor for these arguments. ExecutorsAndKeys* executors_and_keys; - RunStateArgs run_state_args; + // TODO(cais): TFDBG support for partial runs. 
+ DebugOptions debug_options; + RunStateArgs run_state_args(debug_options); run_state_args.is_partial_run = true; TF_RETURN_IF_ERROR(GetOrCreateExecutors(pool, input_names, output_names, target_nodes, &executors_and_keys, @@ -960,14 +993,15 @@ Status DirectSession::GetOrCreateExecutors( thread::ThreadPool* pool, gtl::ArraySlice<string> inputs, gtl::ArraySlice<string> outputs, gtl::ArraySlice<string> target_nodes, ExecutorsAndKeys** executors_and_keys, RunStateArgs* run_state_args) { - string debug_tensor_watches_summary; int64 handle_name_counter_value = -1; if (LogMemory::IsEnabled() || run_state_args->is_partial_run) { handle_name_counter_value = handle_name_counter_.fetch_add(1); } - if (run_state_args->debugger_state) { - debug_tensor_watches_summary = - run_state_args->debugger_state->SummarizeDebugTensorWatches(); + + string debug_tensor_watches_summary; + if (!run_state_args->debug_options.debug_tensor_watch_opts().empty()) { + debug_tensor_watches_summary = SummarizeDebugTensorWatches( + run_state_args->debug_options.debug_tensor_watch_opts()); } // Fast lookup path, no sorting. @@ -1032,6 +1066,9 @@ Status DirectSession::GetOrCreateExecutors( options.fetch_endpoints = outputs_sorted; options.target_nodes = tn_sorted; options.use_function_convention = !run_state_args->is_partial_run; + if (!run_state_args->debug_options.debug_tensor_watch_opts().empty()) { + options.debug_options = run_state_args->debug_options; + } std::shared_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys); @@ -1107,10 +1144,10 @@ Status DirectSession::GetOrCreateExecutors( optimizer.Optimize(lib, options_.env, device, &iter->second); - // EXPERIMENTAL: tfdbg inserts debug nodes (i.e., probes) to the graph - if (run_state_args->debugger_state) { - TF_RETURN_IF_ERROR(run_state_args->debugger_state->DecorateGraphForDebug( - partition_graph.get(), params.device)); + // EXPERIMENTAL: tfdbg inserts debug nodes in the graph. 
+ if (!options.debug_options.debug_tensor_watch_opts().empty()) { + TF_RETURN_IF_ERROR(DecorateAndPublishGraphForDebug( + options.debug_options, partition_graph.get(), params.device)); } TF_RETURN_IF_ERROR(EnsureMemoryTypes(DeviceType(device->device_type()), |