diff options
author | 2017-07-04 10:19:42 -0700 | |
---|---|---|
committer | 2017-07-04 10:23:59 -0700 | |
commit | af23ae65db2585f4a18d0bc5f21f15e94805aa4f (patch) | |
tree | a805f64d0a85fa29ff69d204634379b80cdbcbf1 /tensorflow/core/profiler/g3doc | |
parent | 11ec8b7cfdec0fd498182d0ad8f550b4a8ddaf13 (diff) |
Migrating tfprof python API to tensorflow/python/profiler
Migrating tfprof c++ to tensorflow/core/profiler
API changes:
New tf.profiler namespace.
Within tf.profiler namespace:
tf.profiler.advise # One-shot advise function
tf.profiler.profile # One-shot profile function
tf.profiler.Profiler # Multi-step profile/advise class
tf.profiler.write_op_log # Write profile for offline analysis
PiperOrigin-RevId: 160901831
Diffstat (limited to 'tensorflow/core/profiler/g3doc')
-rw-r--r-- | tensorflow/core/profiler/g3doc/advise.md | 108 | ||||
-rw-r--r-- | tensorflow/core/profiler/g3doc/code_timeline.png | bin | 0 -> 45674 bytes | |||
-rw-r--r-- | tensorflow/core/profiler/g3doc/command_line.md | 316 | ||||
-rw-r--r-- | tensorflow/core/profiler/g3doc/graph_timeline.png | bin | 0 -> 93489 bytes | |||
-rw-r--r-- | tensorflow/core/profiler/g3doc/options.md | 86 | ||||
-rw-r--r-- | tensorflow/core/profiler/g3doc/profile_memory.md | 81 | ||||
-rw-r--r-- | tensorflow/core/profiler/g3doc/profile_model_architecture.md | 92 | ||||
-rw-r--r-- | tensorflow/core/profiler/g3doc/profile_time.md | 179 | ||||
-rw-r--r-- | tensorflow/core/profiler/g3doc/python_api.md | 144 | ||||
-rw-r--r-- | tensorflow/core/profiler/g3doc/scope_timeline.png | bin | 0 -> 24944 bytes |
10 files changed, 1006 insertions, 0 deletions
diff --git a/tensorflow/core/profiler/g3doc/advise.md b/tensorflow/core/profiler/g3doc/advise.md new file mode 100644 index 0000000000..6973c6a84c --- /dev/null +++ b/tensorflow/core/profiler/g3doc/advise.md @@ -0,0 +1,108 @@ +## Auto Detect and Advise + +tfprof analyzes profiles and generates advises for common issues. + +### Run Advise. + +```python +# First create a profiler. See profiler tutorials for more details. +profiler = tf.profiler.Profiler(sess.graph) +run_meta = config_pb2.RunMetadata() +_ = sess.run(r1, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE), + run_metadata=run_meta) +profiler.add_step(1, run_meta) + +# Then Start advise. +profiler.advise(tf.contrib.tfprof.model_analyzer.ALL_ADVICE) + +# For one-shot API +tf.profiler.advise( + sess.graph, run_meta=run_metadata) +``` + +```shell +# Run advisor on CLI +# See CLI tutorial on generating the files. +tfprof --graph_path=graph.pbtxt \ + --run_meta_path=run_metadata \ + --op_log_path=tfprof_log + +tfprof> advise +AcceleratorUtilizationChecker: +device: /job:worker/replica:0/task:0/gpu:0 low utilization: 0.03 +device: /job:worker/replica:0/task:0/gpu:1 low utilization: 0.08 +device: /job:worker/replica:0/task:0/gpu:2 low utilization: 0.04 +device: /job:worker/replica:0/task:0/gpu:3 low utilization: 0.21 + +OperationChecker: +Found operation using NHWC data_format on GPU. Maybe NCHW is faster. 
+ +ExpensiveOperationChecker: +top 1 operation type: SoftmaxCrossEntropyWithLogits, cpu: 1.37sec, accelerator: 0us, total: 1.37sec (26.68%) +top 2 operation type: MatMul, cpu: 427.39ms, accelerator: 280.76ms, total: 708.14ms (13.83%) +top 3 operation type: ConcatV2, cpu: 357.83ms, accelerator: 31.80ms, total: 389.63ms (7.61%) +seq2seq_attention_model.py:360:build_graph:self._add_seq2seq(), cpu: 3.16sec, accelerator: 214.84ms, total: 3.37sec + seq2seq_attention_model.py:293:_add_seq2seq:decoder_outputs, ..., cpu: 2.46sec, accelerator: 3.25ms, total: 2.47sec + seq2seq_lib.py:181:sampled_sequence_...:average_across_ti..., cpu: 2.46sec, accelerator: 3.24ms, total: 2.47sec + seq2seq_lib.py:147:sequence_loss_by_...:crossent = loss_f..., cpu: 2.46sec, accelerator: 3.06ms, total: 2.46sec + seq2seq_attention_model.py:289:sampled_loss_func:num_classes=vsize), cpu: 2.46sec, accelerator: 3.06ms, total: 2.46sec + seq2seq_attention_model.py:282:sampled_loss_func:labels = tf.resha..., cpu: 164us, accelerator: 0us, total: 164us + seq2seq_lib.py:148:sequence_loss_by_...:log_perp_list.app..., cpu: 1.33ms, accelerator: 120us, total: 1.45ms + seq2seq_lib.py:151:sequence_loss_by_...:total_size = tf.a..., cpu: 154us, accelerator: 23us, total: 177us + seq2seq_lib.py:184:sampled_sequence_...:return cost / tf...., cpu: 97us, accelerator: 8us, total: 105us + math_ops.py:690:cast:return gen_math_o..., cpu: 62us, accelerator: 3us, total: 65us + math_ops.py:839:binary_op_wrapper:return func(x, y,..., cpu: 35us, accelerator: 5us, total: 40us + seq2seq_attention_model.py:192:_add_seq2seq:sequence_length=a..., cpu: 651.56ms, accelerator: 158.92ms, total: 810.48ms + seq2seq_lib.py:104:bidirectional_rnn:sequence_length, ..., cpu: 306.58ms, accelerator: 73.54ms, total: 380.12ms + core_rnn.py:195:static_rnn:state_size=cell.s..., cpu: 306.52ms, accelerator: 73.54ms, total: 380.05ms + rnn.py:218:_rnn_step:_maybe_copy_some_..., cpu: 303.76ms, accelerator: 73.54ms, total: 377.30ms + 
rnn.py:216:_rnn_step:time >= max_seque..., cpu: 2.75ms, accelerator: 0us, total: 2.75ms + core_rnn.py:179:static_rnn:max_sequence_leng..., cpu: 67us, accelerator: 0us, total: 67us + seq2seq_lib.py:110:bidirectional_rnn:initial_state_bw,..., cpu: 296.21ms, accelerator: 73.54ms, total: 369.75ms + core_rnn.py:195:static_rnn:state_size=cell.s..., cpu: 296.11ms, accelerator: 73.54ms, total: 369.65ms + rnn.py:218:_rnn_step:_maybe_copy_some_..., cpu: 292.04ms, accelerator: 73.54ms, total: 365.58ms + rnn.py:216:_rnn_step:time >= max_seque..., cpu: 4.07ms, accelerator: 0us, total: 4.07ms + core_rnn.py:178:static_rnn:min_sequence_leng..., cpu: 85us, accelerator: 0us, total: 85us + core_rnn.py:179:static_rnn:max_sequence_leng..., cpu: 16us, accelerator: 0us, total: 16us + seq2seq_lib.py:113:bidirectional_rnn:outputs = [tf.con..., cpu: 46.88ms, accelerator: 3.87ms, total: 50.75ms + ...(omitted) +top 1 graph node: seq2seq/loss/sampled_sequence_loss/sequence_loss_by_example/SoftmaxCrossEntropyWithLogits_11, cpu: 89.92ms, accelerator: 0us, total: 89.92ms +top 2 graph node: train_step/update_seq2seq/output_projection/w/ApplyAdam, cpu: 84.52ms, accelerator: 0us, total: 84.52ms +top 3 graph node: seq2seq/loss/sampled_sequence_loss/sequence_loss_by_example/SoftmaxCrossEntropyWithLogits_19, cpu: 73.02ms, accelerator: 0us, total: 73.02ms +``` + +### Checker + +There is no magic behind advise mode. tfprof builds the profiles first, then +it runs through a list of `Checkers`, each one responsible for checking one +area with the profile and report issues. A `Checker` is like a plugin. + +For example: + +#### JobChecker (Not Available OSS) + +* Checks RecvTensor RPC latency and bandwidth. +* Checks CPU/Memory utilization of the job. + +####AcceleratorUtilization Checker +* Checks what percentage of time the accelerator spends on computation. + +#### OperationChecker + +* Checks whether the operation runs with optimal options. 
+* Checks if there is a better implementation to replace the current operation. + +#### ExpensiveOperationChecker + +* Checks the most expensive operation type. +* Checks the most expensive graph nodes. +* Checks the most expensive graph-building Python codes. + +####Contribute Your Checker + +Follow examples of accelerator_utilization_checker.h + + + diff --git a/tensorflow/core/profiler/g3doc/code_timeline.png b/tensorflow/core/profiler/g3doc/code_timeline.png Binary files differnew file mode 100644 index 0000000000..c5ab246f7d --- /dev/null +++ b/tensorflow/core/profiler/g3doc/code_timeline.png diff --git a/tensorflow/core/profiler/g3doc/command_line.md b/tensorflow/core/profiler/g3doc/command_line.md new file mode 100644 index 0000000000..06ebe8e949 --- /dev/null +++ b/tensorflow/core/profiler/g3doc/command_line.md @@ -0,0 +1,316 @@ +## Command Line Interface Tutorials + +* [Command Line Inputs](#command-line-inputs) +* [Start `tfprof`](#start-tfprof) +* [Examples](#examples) + * [Profile Python Time](#profile-python-time) + * [Profile Graph Time](#profile-graph-time) + * [Profile Checkpoint Value](#profile-checkpoint-value) + * [Profile Model Parameter](#profile-model-parameter) + * [Profile Device Placement](#profile-device-placement) + * [Define Customized Operation Type](#define-customized-operation-type) + * [Non-interactive Mode](#non-interactive-mode) + + +### Command Line Inputs + +tfprof command line tool uses the following inputs: + +<b>--graph_path:</b> GraphDef text file (required). Used to build in-memory +architecture of the model. For example, graph.pbtxt written by tf.Supervisor +can be passed to --graph_path. You can also easily get GraphDef using +tf.get_default_graph().as_graph_def(add_shapes=True) or other API. + +<b>--run_meta_path:</b> tensorflow::RunMetadata (optional). +Used to get the memory consumption and execution time of +each op of the model. 
+ +The following code snippet writes a RunMetadata file: + +```python +run_options = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE) +run_metadata = config_pb2.RunMetadata() +_ = self._sess.run(..., options=run_options, run_metadata=run_metadata) +with tf.gfile.Open(os.path.join(output_dir, "run_meta"), "w") as f: + f.write(run_metadata.SerializeToString()) +``` + +<b>--op_log_path:</b> +tensorflow::tfprof::OpLog (optional). A proto used to provide extra operation +information. 1) float operations. 2) code traces. 3) define customized operation +type for -account_type_regexes option. + +The following code snippet writes a OpLog file. + +```python +tf.profiler.write_op_log(graph, log_dir, op_log=None) +``` + +<b>--checkpoint_path:</b> TensorFlow checkpoint (optional). +It defines _checkpoint_variable op type. It also provides checkpointed tensors' values. + + +###Start `tfprof` + +#### Build `tfprof` + +```shell +# Build the tool. +bazel build --config opt tensorflow/core/profiler/... + +# Help information, including detail 'option' instructions. +bazel-bin/tensorflow/core/profiler/tfprof help +``` + +#### Start `tfprof` Interactive Mode +```shell +# The following commands will start tfprof interactive mode. +# +# --graph_path contains the model architecutre and tensor shapes. +# --run_meta_path contains the memory and time information. +# --op_log_path contains float operation and code traces. +# --checkpoint_path contains the model checkpoint data. +# +# Only includes model architecture, parameters and shapes. +bazel-bin/tensorflow/core/profiler/tfprof \ + --graph_path=graph.pbtxt +# +# Additionally profile ops memory and timing. +bazel-bin/tensorflow/core/profiler/tfprof \ + --graph_path=graph.pbtxt \ + --run_meta_path=run_meta \ +# +# tfprof_log is used to define customized op types, float ops and code traces. +# Use tfprof_logger.write_op_log() to create tfprof_log. 
+bazel-bin/tensorflow/core/profiler/tfprof \ + --graph_path=graph.pbtxt \ + --run_meta_path=run_meta \ + --op_log_path=tfprof_log \ +# +# Additionally profile checkpoint statistics and values. +# Use '-account_type_regexes _checkpoint_variables' to select +# checkpoint tensors. +bazel-bin/tensorflow/core/profiler/tfprof \ + --graph_path=graph.pbtxt \ + --run_meta_path=run_meta \ + --op_log_path=tfprof_log \ + --checkpoint_path=model.ckpt +``` + +#### Start `tfprof` Non-interactive Mode. + +```python +# Runs tfprof in one-shot. +bazel-bin/tensorflow/core/profiler/tfprof scope \ + --graph_path=graph.pbtxt \ + --max_depth=3 +``` + +#### Press enter to show the default options + +Refer to [Options](options.md) for option instructions. + +```shell +tfprof> +-max_depth 4 +-min_bytes 0 +-min_micros 0 +-min_params 0 +-min_float_ops 0 +-min_occurrence 0 +-step -1 +-order_by name +-account_type_regexes Variable,VariableV2 +-start_name_regexes .* +-trim_name_regexes +-show_name_regexes .* +-hide_name_regexes IsVariableInitialized_[0-9]+,save\/.*,^zeros[0-9_]* +-account_displayed_op_only false +# supported select fields. Availability depends on --[run_meta|checkpoint|op_log]_path. +# [bytes|micros|params|float_ops|occurrence|tensor_value|device|op_types] +-select params +# format: output_type:key=value,key=value... +# output_types: stdout (default), timeline, file. +# key=value pairs: +# 1. timeline: outfile=<filename> +# 2. file: outfile=<filename> +# 3. stdout: None. +# E.g. 
timeline:outfile=/tmp/timeline.json +-output +``` + +###Examples + +####Profile Python Time +```shell +# Requires --graph_path --op_log_path +tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros +_TFProfRoot (0us/22.44ms) + model_analyzer_test.py:149:run_filename_as_m...:none (0us/22.44ms) + model_analyzer_test.py:33:_run_code_in_main:none (0us/22.44ms) + model_analyzer_test.py:208:<module>:test.main() (0us/22.44ms) + model_analyzer_test.py:132:testComplexCodeView:x = lib.BuildFull... (0us/22.44ms) + model_analyzer_testlib.py:63:BuildFullModel:return sgd_op.min... (0us/21.83ms) + model_analyzer_testlib.py:58:BuildFullModel:cell, array_ops.c... (0us/333us) + model_analyzer_testlib.py:54:BuildFullModel:seq.append(array_... (0us/254us) + model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0us/134us) + model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0us/40us) + ... + model_analyzer_testlib.py:61:BuildFullModel:loss = nn_ops.l2_... (0us/28us) + model_analyzer_testlib.py:60:BuildFullModel:target = array_op... (0us/0us) + model_analyzer_test.py:134:testComplexCodeView:sess.run(variable... (0us/0us) +``` + +Set ```-output timeline:outfile=<filename>``` to generate timeline instead of stdout. +<left> +![CodeTimeline](code_timeline.png) +</left> + +#### Profile Graph Time + +```shell +# I defined an op named ‘cost’ to calculate the loss. I want to know what ops +# it depends on take a long time to run. + +# Requires --graph_path, --run_meta_path. 
+tfprof> graph -start_name_regexes cost.* -max_depth 100 -min_micros 10000 -select micros -account_type_regexes .* +_TFProfRoot (0us/3.61sec) + init/init_conv/Conv2D (11.75ms/3.10sec) + random_shuffle_queue_DequeueMany (3.09sec/3.09sec) + unit_1_0/sub2/conv2/Conv2D (74.14ms/3.19sec) + unit_1_3/sub2/conv2/Conv2D (60.75ms/3.34sec) + unit_2_4/sub2/conv2/Conv2D (73.58ms/3.54sec) + unit_3_3/sub2/conv2/Conv2D (10.26ms/3.60sec) +``` + +#### Profile Checkpoint Value +```shell +# Requires --graph_path, --checkpoint_path. +tfprof> scope -show_name_regexes unit_1_0.*gamma -select tensor_value -max_depth 5 +_TFProfRoot () + unit_1_0/shared_activation/init_bn/gamma () +[1.80 2.10 2.06 1.91 2.26 1.86 1.81 1.37 1.78 1.85 1.96 1.54 2.04 2.34 2.22 1.99 ], + unit_1_0/sub2/bn2/gamma () +[1.57 1.83 1.30 1.25 1.59 1.14 1.26 0.82 1.19 1.10 1.48 1.01 0.82 1.23 1.21 1.14 ], +``` + +#### Profile Model Parameter + +```shell +# Show the number of parameters of all `tf.trainable_variables()` in the model. +# Requires --graph_path --op_log_path. +# store option for future commands. +tfprof> set -account_type_regexes _trainable_variables +tfprof> scope -max_depth 4 -select params +_TFProfRoot (--/464.15k params) + init/init_conv/DW (3x3x3x16, 432/432 params) + pool_logit/DW (64x10, 640/640 params) + pool_logit/biases (10, 10/10 params) + unit_last/final_bn/beta (64, 64/64 params) + unit_last/final_bn/gamma (64, 64/64 params) +``` + +Where does `_trainable_variables` come from? It is customized operation type +defined through the OpLog file. +Users can [Define Customized Operation Type](#define-customized-operation-type) + +<b>Following example shows importance of defining customized operation type.</b> +In this example, extra `Variables` are created by TensorFlow +implicitly and “/Momentum” is appended to their names. They shouldn't be +included in you “model capacity” calculation. 
+ +```shell +tfprof> scope -account_type_regexes VariableV2 -max_depth 4 -select params +_TFProfRoot (--/930.58k params) + global_step (1/1 params) + init/init_conv/DW (3x3x3x16, 432/864 params) + pool_logit/DW (64x10, 640/1.28k params) + pool_logit/DW/Momentum (64x10, 640/640 params) + pool_logit/biases (10, 10/20 params) + pool_logit/biases/Momentum (10, 10/10 params) + unit_last/final_bn/beta (64, 64/128 params) + unit_last/final_bn/gamma (64, 64/128 params) + unit_last/final_bn/moving_mean (64, 64/64 params) + unit_last/final_bn/moving_variance (64, 64/64 params) +``` + +#### Profile Device Placement + +In this tutorial, a model is split +on several gpus at workers and several parameter servers. + +In tfprof, 'device' is an op_type. For example, if op1 and op2 are placed on +gpu:0. They share an operation type. + +```shell +bazel-bin/tensorflow/core/profiler/tfprof \ + --graph_path=/tmp/graph.pbtxt \ + --run_meta_path=/tmp/run_meta + +# Looks like ps task 1 is holding twice more parameters than task 0. +tfprof> scope -select device,params -account_type_regexes .*ps.*task:0.* -max_depth 1 +_TFProfRoot (--/25.81m params) +tfprof> scope -select device,params -account_type_regexes .*ps.*task:1.* -max_depth 1 +_TFProfRoot (--/58.84m params) +``` + +#### Define Customized Operation Type + +First, in Python code, create an `OpLog` proto and add op type +information to it: + +```python + +op_log = tfprof_log_pb2.OpLog() +entry = op_log.log_entries.add() +entry.name = 'pool_logit/DW' +entry.types.append('pool_logit') +entry = op_log.log_entries.add() +entry.name = 'pool_logit/biases' +entry.types.append('pool_logit') +``` + +Second, call write_op_log to write the OpLog proto. + +```python +tf.profiler.write_op_log( + sess.graph, /tmp/my_op_log_dir, op_log) + +# Get run-time shape information in order to fill shapes and get flops. 
+tf.profiler.write_op_log( + sess.graph, /tmp/my_op_log_dir, op_log, run_meta) +``` + +Third, when starting the tfprof tool, specify +"--op_log_path /tmp/my_op_log_dir/op_log" + +```shell +tfprof> scope -account_type_regexes pool_logit -max_depth 4 -select params +_TFProfRoot (--/650 params) + pool_logit/DW (64x10, 640/640 params) + pool_logit/biases (10, 10/10 params) +``` + +Note that `tf.profiler.write_op_log(...)` automatically +assigns all `Variables` inside `tf.trainable_variables()` a customized +operation type: `_trainable_variables`. + + +#### Non-interactive Mode +12) Run tfprof in one-shot mode and dump result to file. + +```shell +# By default output to stdout. Use -output option to change output types. +tfprof scope --graph_path=graph.pbtxt \ + --max_depth=3 \ + --output="file:outfile=/tmp/dump" +Reading Files... +Parsing GraphDef... +Preparing Views... + +cat /tmp/dump +_TFProfRoot (--/930.58k params) + global_step (0/0 params) + pool_logit/DW (64x10, 640/1.28k params) + pool_logit/biases (10, 10/20 params) +``` diff --git a/tensorflow/core/profiler/g3doc/graph_timeline.png b/tensorflow/core/profiler/g3doc/graph_timeline.png Binary files differnew file mode 100644 index 0000000000..98bfaa175f --- /dev/null +++ b/tensorflow/core/profiler/g3doc/graph_timeline.png diff --git a/tensorflow/core/profiler/g3doc/options.md b/tensorflow/core/profiler/g3doc/options.md new file mode 100644 index 0000000000..57f67c66fa --- /dev/null +++ b/tensorflow/core/profiler/g3doc/options.md @@ -0,0 +1,86 @@ +##Options + +###Overview + +For all tfprof views, the statistics are processed with the following procedures + +1) An in-memory data structure is used represent the view. + +2) `-account_type_regexes` is used to first select the operations that match + the specified operation types. An operation has its default type + (e.g. MatMul, Conv2D). `tfprof` also considers device as operation type. + User can also define customized operation type. 
Hence, an operation has + multiple types. Operations with matched + types are selected for display and their statistics are aggregated + by the in-memory data structure. + +3) Various `-xxx_name_regexes`, `-min_xxx`, `-max_depth` etc options are then + applied to further filter based on names and values. + It's no limited operation name. In code view, + it's the code trace. In op view, it's the operation type name. Different + from `-account_type_regexes`, Statistics are used even if a name is not displayed. + For example, in code view, a callee might be hidden, but its statistics is + still aggregated by it's caller. `-account_displayed_op_only`, however, + breaks the rule and only use statistics of displayed names. + +4) Finally, the filtered data structure is displayed in a format depending + on the `-output` option. + +####Option Semantics In Different View +options usually have the same semantics in different views. However, some +can vary. For example `-max_depth` in scope view means the depth of +name scope <b>tree</b>. In op view, it means the length of operation <b>list</b>. +In graph view, in means the number of hops in the <b>graph</b>. + + +###Docs + +`-max_depth`: Show ops that are at most this number of hops from starting op in the tree/graph structure. + +`-min_bytes`: Show ops that request at least this number of bytes. + +`-min_micros`: Show ops that spend at least this number of microseconds to run. + +`-min_params`: Show ops that contains at least this number of parameters. + +`-min_float_ops`: Show ops that contain at least this number of float operations. Only available if an op has op.RegisterStatistics() defined and OpLog is provided + +`-min_occurrence`: Show ops that appear at least this number of times. Only available in "op" view. + +`-step`: Show the stats of the this step when multiple steps of RunMetadata were added. By default, show the average of all steps." 
+ +`-order_by`: Order the results by [name|depth|bytes|micros|accelerator_micros|cpu_micros|params|float_ops|occurrence] + +`-account_type_regexes`: Account and display the ops whose types match one of the type regexes specified. tfprof allow user to define extra op types for ops through tensorflow.tfprof.OpLog proto. regexes are comma-sperated. + +`-start_name_regexes`: Show ops starting from the ops that matches the regexes, recursively. regexes are comma-separated. + +`-trim_name_regexes`: Hide ops starting from the ops that matches the regexes, recursively, regexes are comma-seprated. + +`-show_name_regexes`: Show ops that match the regexes. regexes are comma-seprated. + +`-hide_name_regexes`: Hide ops that match the regexes. regexes are comma-seprated. + +Notes: For each op, `-account_type_regexes` is first evaluated, only ops with +types matching the specified regexes are accounted and selected for displayed. +`-start/trim/show/hide_name_regexes` are used to further filter ops for display. +`-start_name_regexes` is evaluated first to search the starting ops to display. +Descendants of starting ops are then evaluated against `-show/hide_name_regexes` +to make display decision. If an op matches trim_name_regexes, all its +descendants are hidden. Ops statistics are *accounted even if they are hidden* +as long as they match the `-account_xxx` options. + +`-account_displayed_op_only`: If True, only account the statistics of ops eventually displayed. If False, account all op statistics matching -account_type_regexes recursively. + +`-select`: Comma-separated list of metrics to show: +[bytes|micros|accelerator_micros|cpu_micros|params|float_ops|occurrence|tensor_value|device|op_types|input_shapes]. + +`-output`: Output results as stdout, file or timeline. +The format is ```output_type:key=value,key=value```. +For example: ```-output timeline:outfile=<filename>```. + +```shell +timeline: key=outfile, value=<filename>. +stdout: none. 
+file: key=outfile, value=<filename>. +``` diff --git a/tensorflow/core/profiler/g3doc/profile_memory.md b/tensorflow/core/profiler/g3doc/profile_memory.md new file mode 100644 index 0000000000..e897967d3b --- /dev/null +++ b/tensorflow/core/profiler/g3doc/profile_memory.md @@ -0,0 +1,81 @@ +##Profile Memory + +It is generally a good idea to visualize the memory usage in timeline. +It allows you to see the memory consumption of each GPU over time. + +```python +#To get memory information, you need --graph_path and --run_meta_path +tfprof> graph -max_depth 10000000 -step 0 -account_type_regexes .* -output timeline:outfile=<filename> +generating trace file. + +****************************************************** +Timeline file is written to <filename> +Open a Chrome browser, enter URL chrome://tracing and load the timeline file. +****************************************************** +``` + +<left> +TODO(xpan): Show the image correctly in github. +![Timeline](graph_timeline.png) +</left> + + +```python +# You can also visualize the memory information through other methods. + +# With op view, it shows you the aggregated output tensor bytes of each +# operation type. +tfprof> op -select bytes -order_by bytes +node name | output bytes +Identity 32515.37MB (100.00%, 27.02%) +FusedBatchNormGrad 10802.14MB (72.98%, 8.98%) +FusedBatchNorm 10517.52MB (64.01%, 8.74%) +Conv2D 10509.25MB (55.27%, 8.73%) +Conv2DBackpropInput 9701.39MB (46.54%, 8.06%) +ReluGrad 9206.45MB (38.48%, 7.65%) +Relu 8462.80MB (30.83%, 7.03%) +DepthwiseConv2dNativeBackpropInput 7899.35MB (23.80%, 6.56%) +DepthwiseConv2dNative 7425.17MB (17.23%, 6.17%) +MaxPoolGrad 3015.44MB (11.06%, 2.51%) +AddN 2741.49MB (8.56%, 2.28%) + +# With scope view, you can see the operations that outputs largest tensors. 
+tfprof> scope -order_by bytes -select bytes -min_bytes 100000000 +node name | output bytes +_TFProfRoot (--/120356.38MB) + tower_3/SepConv2d_2b_3x3/separable_conv2d (346.85MB/854.00MB) + tower_3/SepConv2d_2b_3x3/separable_conv2d/depthwise (507.15MB/507.15MB) + tower_0/SepConv2d_2b_3x3/separable_conv2d (346.85MB/693.71MB) + tower_0/SepConv2d_2b_3x3/separable_conv2d/depthwise (346.85MB/346.85MB) + tower_2/SepConv2d_2b_3x3/separable_conv2d (346.85MB/693.71MB) + tower_2/SepConv2d_2b_3x3/separable_conv2d/depthwise (346.85MB/346.85MB) + tower_1/SepConv2d_2b_3x3/separable_conv2d (346.85MB/693.71MB) + tower_1/SepConv2d_2b_3x3/separable_conv2d/depthwise (346.85MB/346.85MB) + tower_3/SepConv2d_2a_3x3/separable_conv2d (346.85MB/520.28MB) + tower_3/SepConv2d_2a_3x3/separable_conv2d/depthwise (173.43MB/173.43MB) + tower_2/SepConv2d_2a_3x3/separable_conv2d (346.85MB/520.28MB) + tower_2/SepConv2d_2a_3x3/separable_conv2d/depthwise (173.43MB/173.43MB) + tower_0/SepConv2d_2a_3x3/separable_conv2d (346.85MB/520.28MB) + tower_0/SepConv2d_2a_3x3/separable_conv2d/depthwise (173.43MB/173.43MB) + ... + +# code view. +tfprof> code -max_depth 10 -select bytes -order_by bytes -start_name_regexes .*seq2seq.* -min_bytes 1 +node name | output bytes +_TFProfRoot (--/74148.60MB) + seq2seq_attention.py'>:168:run_filename_from...:none (0B/74148.60MB) + seq2seq_attention.py'>:33:_run_code_in_main:none (0B/74148.60MB) + seq2seq_attention.py:316:<module>:app.run() (0B/74148.60MB) + app.py:432:run:_run_main(main or... (0B/74148.60MB) + app.py:352:_run_main:sys.exit(main(arg... (0B/74148.60MB) + seq2seq_attention.py:270:main:_Train(model, bat... (0B/74148.60MB) + seq2seq_attention.py:128:_Train:model.build_graph() (0B/74148.60MB) + seq2seq_attention_model.py:363:build_graph:self._add_train_o... (0B/48931.86MB) + seq2seq_attention_model.py:307:_add_train_op:tf.gradients(self... (0B/46761.06MB) + seq2seq_attention_model.py:322:_add_train_op:zip(grads, tvars)... 
(0B/2170.80MB) + seq2seq_attention_model.py:312:_add_train_op:tf.train.exponent... (0B/2.56KB) + seq2seq_attention_model.py:308:_add_train_op:tf.summary.scalar... (0B/64B) + seq2seq_attention_model.py:320:_add_train_op:tf.summary.scalar... (0B/64B) + seq2seq_attention_model.py:360:build_graph:self._add_seq2seq() (0B/25216.74MB) + seq2seq_attention_model.py:192:_add_seq2seq:sequence_length=a... (0B/21542.55MB) +```
\ No newline at end of file diff --git a/tensorflow/core/profiler/g3doc/profile_model_architecture.md b/tensorflow/core/profiler/g3doc/profile_model_architecture.md new file mode 100644 index 0000000000..6d49cdcb75 --- /dev/null +++ b/tensorflow/core/profiler/g3doc/profile_model_architecture.md @@ -0,0 +1,92 @@ +##Profile Model Architecture + +* [Profile Model Parameters](#profile-model-parameters) +* [Profile Model Float Operations](#profile-model-float-operations) + +###Profile Model Parameters + +<b>Notes:</b> +`VariableV2` operation type might contain variables created by TensorFlow +implicitly. User normally don't want to count them as "model capacity". +We can use customized operation type to select a subset of variables. +For example `_trainable_variables` is created automatically by tfprof Python +API. User can also define customized operation type. + +``` +# parameters are created by operation type 'VariableV2' (For older model, +# it's 'Variable'). scope view is usually suitable in this case. +tfprof> scope -account_type_regexes VariableV2 -max_depth 4 -select params +_TFProfRoot (--/930.58k params) + global_step (1/1 params) + init/init_conv/DW (3x3x3x16, 432/864 params) + pool_logit/DW (64x10, 640/1.28k params) + pool_logit/DW/Momentum (64x10, 640/640 params) + pool_logit/biases (10, 10/20 params) + pool_logit/biases/Momentum (10, 10/10 params) + unit_last/final_bn/beta (64, 64/128 params) + unit_last/final_bn/gamma (64, 64/128 params) + unit_last/final_bn/moving_mean (64, 64/64 params) + unit_last/final_bn/moving_variance (64, 64/64 params) + +# The Python API profiles tf.trainable_variables() instead of VariableV2. +# +# By default, it's printed to stdout. User can update options['output'] +# to write to file. The result is always returned as a proto buffer. +param_stats = tf.profiler.profile( + tf.get_default_graph(), + options=tf.contrib.tfprof.model_analyzer. 
+ TRAINABLE_VARS_PARAMS_STAT_OPTIONS) +sys.stdout.write('total_params: %d\n' % param_stats.total_parameters) +``` + +###Profile Model Float Operations + +####Caveats + +For an operation to have float operation statistics: + +* It must have `RegisterStatistics('flops')` defined in TensorFlow. tfprof +use the definition to calculate float operations. Contributes are welcome. + +* It must have known "shape" information for RegisterStatistics('flops') +to calculate the statistics. It is suggested to pass in `-run_meta_path` if +shape is only known during runtime. tfprof can fill in the missing shape with +the runtime shape information from RunMetadata. +Hence, it is suggested to use `-account_displayed_name_only` +option so that you know the statistics are only for the operations printed out. + +* If no RunMetadata provided, tfprof count float_ops of each graph node once, +even if it is defined in tf.while_loop. This is because tfprof doesn't know +how many times are run statically. If RunMetadata provided, tfprof calculate +float_ops as float_ops * run_count. + + + +```python +# To profile float opertions in commandline, you need to pass --graph_path +# and --op_log_path. +tfprof> scope -min_float_ops 1 -select float_ops -account_displayed_op_only +node name | # float_ops +_TFProfRoot (--/17.63b flops) + gradients/pool_logit/xw_plus_b/MatMul_grad/MatMul (163.84k/163.84k flops) + gradients/pool_logit/xw_plus_b/MatMul_grad/MatMul_1 (163.84k/163.84k flops) + init/init_conv/Conv2D (113.25m/113.25m flops) + pool_logit/xw_plus_b (1.28k/165.12k flops) + pool_logit/xw_plus_b/MatMul (163.84k/163.84k flops) + unit_1_0/sub1/conv1/Conv2D (603.98m/603.98m flops) + unit_1_0/sub2/conv2/Conv2D (603.98m/603.98m flops) + unit_1_1/sub1/conv1/Conv2D (603.98m/603.98m flops) + unit_1_1/sub2/conv2/Conv2D (603.98m/603.98m flops) + +# Some might prefer op view that aggregate by operation type. 
+tfprof> op -min_float_ops 1 -select float_ops -account_displayed_op_only -order_by float_ops +node name | # float_ops +Conv2D 17.63b float_ops (100.00%, 100.00%) +MatMul 491.52k float_ops (0.00%, 0.00%) +BiasAdd 1.28k float_ops (0.00%, 0.00%) + +# You can also do that in Python API. +tf.profiler.profile( + tf.get_default_graph(), + options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS) +``` diff --git a/tensorflow/core/profiler/g3doc/profile_time.md b/tensorflow/core/profiler/g3doc/profile_time.md new file mode 100644 index 0000000000..db555b3617 --- /dev/null +++ b/tensorflow/core/profiler/g3doc/profile_time.md @@ -0,0 +1,179 @@ +##Profile Time + +* [Times in TensorFlow and tfprof](#times-in-tensorflow-and-tfprof) +* [Profile by Python Code](#profile-by-python-code) +* [Profile by Operation Type](#profile-by-operation-type) +* [Profile by Graph](#profile-by-graph) +* [Profile by Name Scope](#profile-by-name-scope) + + +###Times in TensorFlow and tfprof +When we run a model, Tensorflow schedules and runs the nodes (operations) +in the graph. An operation can be placed on an accelerator or on CPU. + + +#### On Accelerator +When an operation is placed on accelerator, it will first be scheduled +by TensorFlow on CPU. Normally, it's the code in OpKernel::Compute. +OpKernel::Compute can decide to dispatch some of the computations on the +accelerator. While some computation (e.g. pre-processing) is still done +in CPU. OpKernel::Compute can dispatch computation on accelerator +and return, or it can also wait for the accelerator to finish. + +tfprof reports 3 execution times: + + * <b>accelerator_micros</b>, which is the part of computation time spent on accelerator. + * <b>cpu_micros</b>, which is the part of computation time spent on cpu, including + any wait times that might happen if OpKernel::Compute decides to wait. + * <b>exec_micros</b>, which is the sum of accelerator_micros and cpu_micros. 
+ +Since accelerator, such as GPU, usually runs operation asynchronously, you +might notice an operation finishes on cpu before it starts running on +accelerator. + +#### On CPU +When an operation is placed on CPU, it will completely run on CPU. Hence, +<b>exec_micros</b> is equal to <b>cpu_micros</b> and <b>accelerator_micros</b> +should be 0. + + +###Profile by Python Code +```python +# In code view, the time of each line of Python code is the aggregated +# times of all operations created by that line. +# In command line, it requires --graph_path --op_log_path and --run_meta_path. +# --op_log_path provides the code traces information. +# --run_meta_path provides the time information. + +tfprof> code -show_name_regexes seq2seq_attention.* -max_depth 10 -select micros -order_by micros +node name | execution time +_TFProfRoot (--/3.74sec) + seq2seq_attention.py'>:168:run_filename_from...:none (0us/3.74sec) + seq2seq_attention.py'>:33:_run_code_in_main:none (0us/3.74sec) + seq2seq_attention.py:316:<module>:app.run() (0us/3.74sec) + seq2seq_attention.py:270:main:_Train(model, bat... (0us/3.74sec) + seq2seq_attention.py:128:_Train:model.build_graph() (0us/3.74sec) + seq2seq_attention_model.py:360:build_graph:self._add_seq2seq() (0us/2.79sec) + seq2seq_attention_model.py:293:_add_seq2seq:decoder_outputs, ... (0us/2.46sec) + seq2seq_attention_model.py:192:_add_seq2seq:sequence_length=a... (0us/265.31ms) + seq2seq_attention_model.py:253:_add_seq2seq:initial_state_att... (0us/50.35ms) + seq2seq_attention_model.py:173:_add_seq2seq:for x in encoder_... (0us/8.72ms) + seq2seq_attention_model.py:218:_add_seq2seq:w_t = tf.transpos... (0us/2.39ms) + ... + seq2seq_attention_model.py:363:build_graph:self._add_train_o... (0us/949.10ms) + seq2seq_attention_model.py:307:_add_train_op:tf.gradients(self... (0us/641.44ms) + seq2seq_attention_model.py:322:_add_train_op:zip(grads, tvars)... (0us/307.56ms) + ... + seq2seq_attention_model.py:364:build_graph:self._summaries =... 
(0us/13us) + seq2seq_attention_model.py:361:build_graph:self.global_step ... (0us/12us) + ... + seq2seq_attention.py:129:_Train:saver = tf.train.... (0us/0us) + seq2seq_attention.py:140:_Train:global_step=model... (0us/0us) + +# Sometimes you want to explore a specific function. You can do that +# with -start_name_regexes. +tfprof> code -start_name_regexes .*_add_seq2seq.* -show_name_regexes seq2seq_attention.* -max_depth 10 -select micros -order_by micros +node name | execution time +_TFProfRoot (--/3.74sec) + seq2seq_attention_model.py:360:build_graph:self._add_seq2seq() (0us/2.79sec) + seq2seq_attention_model.py:293:_add_seq2seq:decoder_outputs, ... (0us/2.46sec) + seq2seq_attention_model.py:289:sampled_loss_func:num_classes=vsize) (0us/2.46sec) + seq2seq_attention_model.py:282:sampled_loss_func:labels = tf.resha... (0us/164us) + +# You can also dive deeper into tensorflow's libraries. +tfprof> code -max_depth 5 -select micros -order_by micros -start_name_regexes .*_add_seq2seq.* -min_micros 100000 +_TFProfRoot (--/3.74sec) + seq2seq_attention_model.py:360:build_graph:self._add_seq2seq() (0us/2.79sec) + seq2seq_attention_model.py:293:_add_seq2seq:decoder_outputs, ... (0us/2.46sec) + seq2seq_lib.py:181:sampled_sequence_...:average_across_ti... (0us/2.46sec) + seq2seq_lib.py:147:sequence_loss_by_...:crossent = loss_f... (0us/2.46sec) + seq2seq_attention_model.py:192:_add_seq2seq:sequence_length=a... (0us/265.31ms) + seq2seq_lib.py:104:bidirectional_rnn:sequence_length, ... (0us/127.27ms) + core_rnn.py:195:static_rnn:state_size=cell.s... (0us/127.20ms) + seq2seq_lib.py:110:bidirectional_rnn:initial_state_bw,... (0us/125.96ms) + core_rnn.py:195:static_rnn:state_size=cell.s... 
(0us/125.86ms) + + +# It can also be done in Python API +opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy() +opts['account_type_regexes'] = ['.*'] +opts['show_name_regexes'] = ['.*model_analyzer_testlib.py.*'] +opts['account_displayed_op_only'] = False +opts['select'] = ['micros'] + +tfprof_node = model_analyzer.print_model_analysis( + sess.graph, run_meta, cmd='code', options=opts) +``` + +You can generate some visualization in code view: +Set ```-output timeline:outfile=<filename>``` to generate timeline instead of stdout. +<left> +![CodeTimeline](code_timeline.png) +</left> + + +###Profile by Operation Type +```python +# In op view, you can view the aggregated time of each operation type. +tfprof> op -select micros,occurrence -order_by micros +node name | execution time | op occurrence +SoftmaxCrossEntropyWithLogits 1.37sec (100.00%, 36.44%), 30 +MatMul 618.97ms (63.56%, 16.51%), 3450 +Add 273.76ms (47.06%, 7.30%), 2180 +Sub 215.41ms (39.76%, 5.74%), 4372 +ConcatV2 203.88ms (34.01%, 5.44%), 6098 +Mul 134.32ms (28.58%, 3.58%), 9427 +ApplyAdam 92.66ms (25.00%, 2.47%), 27 +Switch 72.43ms (22.53%, 1.93%), 30654 +LogUniformCandidateSampler 69.01ms (20.59%, 1.84%), 30 +Unique 53.50ms (18.75%, 1.43%), 2 +AddN 50.10ms (17.33%, 1.34%), 5481 + +# You might be surprised to see that SoftmaxCrossEntropyWithLogits is +# that expensive. As shown below, it is placed on cpu. +tfprof> op -select micros,device -order_by micros +node name | execution time | assigned devices +SoftmaxCrossEntropyWithLogits 1.37sec (100.00%, 36.44%), /job:worker/replica:0/task:0/cpu:0 +MatMul 618.97ms (63.56%, 16.51%), |/job:worker/replica:0/task:0/cpu:0|/job:worker/replica:0/task:0/gpu:0|/job:worker/replica:0/task:0/gpu:1|/job:worker/replica:0/task:0/gpu:2|/job:worker/replica:0/task:0/gpu:3 +``` + + +###Profile by Graph + +Usually, use graph view to generate a timeline to visualize the result. 
+ +In the chrome://tracing UI, click "Flow Event" in "View Options" of upper +right corner to see the flow of tensors. + +<left> +TODO(xpan): Show the image correctly in github. +![Timeline](graph_timeline.png) +</left> + +tfprof options allow users to generate timeline in some advanced ways. + +```python +# Only generate timeline for gpu3 and cpu on workers. +graph -max_depth 10000000 -step 0 -account_type_regexes .*gpu:3.*,.*worker.*cpu:0.* -output timeline:outfile=<filename.json> +generating trace file. + +****************************************************** +Timeline file is written to <filename.json>. +Open a Chrome browser, enter URL chrome://tracing and load the timeline file. +****************************************************** +``` + +###Profile by Name Scope + +Usually scope view allows you to pin point the problematic places if you +have properly named your operations with tf.name_scope or tf.variable_scope. + +```python +tfprof> scope -max_depth 30 -select micros -min_micros 100000 -order_by micros +node name | execution time +_TFProfRoot (--/8.12sec) + tower_3/gradients/tower_3/Conv2d_1a_3x3/convolution_grad/Conv2DBackpropFilter (126.34ms/126.34ms) + tower_1/gradients/tower_1/Conv2d_1a_3x3/convolution_grad/Conv2DBackpropFilter (125.44ms/125.44ms) + tower_2/gradients/tower_2/Conv2d_1a_3x3/convolution_grad/Conv2DBackpropFilter (124.85ms/124.85ms) + tower_0/gradients/tower_0/Conv2d_1a_3x3/convolution_grad/Conv2DBackpropFilter (124.45ms/124.45ms) +``` diff --git a/tensorflow/core/profiler/g3doc/python_api.md b/tensorflow/core/profiler/g3doc/python_api.md new file mode 100644 index 0000000000..5dda419711 --- /dev/null +++ b/tensorflow/core/profiler/g3doc/python_api.md @@ -0,0 +1,144 @@ +## Python API Tutorials + +* [Parameters and Shapes](#parameters-and-shapes) +* [Float Operations](#float-operations) +* [Time and Memory](#time-and-memory) +* [Visualize](#visualize) +* [Multi-step Profiling](#multi-step-profiling) + +```import tensorflow as tf```. 
+
+### Parameters and Shapes.
+```python
+# Print trainable variable parameter statistics to stdout.
+param_stats = tf.profiler.profile(
+    tf.get_default_graph(),
+    options=tf.contrib.tfprof.model_analyzer.
+        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
+
+# Use code view to associate statistics with Python code.
+opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
+opts['show_name_regexes'] = ['.*my_code1.py.*', '.*my_code2.py.*']
+param_stats = tf.profiler.profile(
+    tf.get_default_graph(),
+    cmd='code',
+    options=opts)
+
+# param_stats can be tensorflow.tfprof.TFGraphNodeProto or
+# tensorflow.tfprof.TFMultiGraphNodeProto, depending on the view.
+# Let's print the root below.
+sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
+```
+
+### Float Operations
+
+#### Note: See [Caveats](profile_model_architecture.md#caveats) in "Profile Model Architecture" Tutorial
+``` python
+# Print to stdout an analysis of the number of floating point operations in the
+# model broken down by individual operations.
+tf.profiler.profile(
+    tf.get_default_graph(),
+    options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
+```
+
+### Time and Memory
+You will first need to run the following setup in your model in order to
+compute the memory and timing statistics.
+
+```python
+# Generate the RunMetadata that contains the memory and timing information.
+#
+# Note: When run on GPU, a kernel is first scheduled (enqueued) and then
+# executed asynchronously. tfprof only tracks the execution time.
+#
+run_metadata = tf.RunMetadata()
+with tf.Session() as sess:
+  _ = sess.run(train_op,
+               options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
+               run_metadata=run_metadata)
+```
+
+Finally, you may run `print_model_analysis` to explore the timing and memory
+information of the model.
+
+``` python
+# See model_analyzer_test.py for more examples.
+# +# Print to stdout an analysis of the memory usage and the timing information +# broken down by python codes. +opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy() +opts['show_name_regexes'] = ['.*my_code.py.*'] +tf.profiler.profile( + tf.get_default_graph(), + run_meta=run_metadata, + cmd='code', + options=opts) + +# Print to stdout an analysis of the memory usage and the timing information +# broken down by operations. +tf.profiler.profile( + tf.get_default_graph(), + run_meta=run_metadata, + options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY) +``` + +### Visualize + +``` +To visualize the result of Python API results: +Set opts['output'] = 'timeline:outfile=<filename>' to generate a timeline json file. +Open a Chrome Browser, open URL chrome://tracing, and load the json file. +``` + +Below are 2 examples of graph view and scope view. See code view example in later examples. + +<left> +![CodeTimeline](graph_timeline.png) +![CodeTimeline](scope_timeline.png) +</left> + +### Multi-step Profiling + +tfprof allows you to profile statistics across multiple steps. + +```python +opts = model_analyzer.PRINT_ALL_TIMING_MEMORY.copy() +opts['account_type_regexes'] = ['.*'] + +with session.Session() as sess: + r1, r2, r3 = lib.BuildSplitableModel() + sess.run(variables.global_variables_initializer()) + + # Create a profiler. + profiler = model_analyzer.Profiler(sess.graph) + # Profile without RunMetadata of any step. + pb0 = profiler.profile_name_scope(opts) + + run_meta = config_pb2.RunMetadata() + _ = sess.run(r1, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE), + run_metadata=run_meta) + + # Add run_meta of step 1. + profiler.add_step(1, run_meta) + pb1 = profiler.profile_name_scope(opts) + + run_meta2 = config_pb2.RunMetadata() + _ = sess.run(r2, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE), + run_metadata=run_meta2) + # Add run_meta of step 2. 
+ profiler.add_step(2, run_meta2) + pb2 = profiler.profile_name_scope(opts) + + run_meta3 = config_pb2.RunMetadata() + _ = sess.run(r3, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE), + run_metadata=run_meta3) + # Add run_meta of step 3. + profiler.add_step(3, run_meta3) + pb3 = profiler.profile_name_scope(opts) +```
\ No newline at end of file diff --git a/tensorflow/core/profiler/g3doc/scope_timeline.png b/tensorflow/core/profiler/g3doc/scope_timeline.png Binary files differnew file mode 100644 index 0000000000..c6d95af84a --- /dev/null +++ b/tensorflow/core/profiler/g3doc/scope_timeline.png |