38 files changed, 1470 insertions, 668 deletions
diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md
index d891ecdc9a..5bfa0247a5 100644
--- a/tensorflow/contrib/tfprof/README.md
+++ b/tensorflow/contrib/tfprof/README.md
@@ -2,81 +2,25 @@
 
 # Full Docment in tensorflow/tools/tfprof/README.md
 
-Author: Xin Pan (xpan@google.com, github: panyx0718)
+Author: Xin Pan (xpan@google.com, github: panyx0718), Jon Shlens, Yao Zhang
 
 Consultants: Jon Shlens, Pete Warden
 
 ###Major Features
 
 1.  Measure model parameters, float operations, tensor shapes.
-2.  Measure op execution times, requested memory size and device placement.
+2.  Profile op execution times, requested memory size and device placement.
 3.  Inspect checkpoint tensors' shapes and their values.
-4.  3 ways to view and explore TensorFlow model profiles
+4.  Selectively group, filter, account and order ops.
 
-    *  Organize by Python code call stack.
-    *  Organize by TensorFlow operation name scope hierarchies.
-    *  Organize by TensorFlow operation inputs/outputs graph.
+####tfprof supports 3 views to organize TensorFlow model profiles
 
-5.  Selectively grouping/filtering/accounting/ordering ops.
+    *  code view: Stats are associated your Python codes and organized as call stacks.
+    *  scope view: Stats are organized as name scope hierarchies.
+    *  graph view: Stats are organized as Tensorflow Op graph.
 
-tfprof can be used as Python API, Interactive CLI and One-shot Script.
+####For each view, there are 3 ways to display outputs:
 
-## Python API Tutorials
-
-tfprof is part of TensorFlow core. Simply ```import tensorflow as tf```.
-
-### Examine the shapes and sizes of all trainiable Variables.
-```python
-# Print trainable variable parameter statistics to stdout.
-param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
-    tf.get_default_graph(),
-    tfprof_options=tf.contrib.tfprof.model_analyzer.
-        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
-
-# param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
-# It organize the statistics
-# of each graph node in tree scructure. Let's print the root below.
-sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
-```
-
-### Examine the number of floating point operations
-``` python
-# Print to stdout an analysis of the number of floating point operations in the
-# model broken down by individual operations.
-#
-# Note: Only Ops with RegisterStatistics('flops') defined have flop stats. It
-# also requires complete shape information. It is common that shape is unknown
-# statically. To complete the shape, provide run-time shape information with
-# tf.RunMetadata to the API (See next example on how to provide RunMetadata).
-tf.contrib.tfprof.model_analyzer.print_model_analysis(
-    tf.get_default_graph(),
-    tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
-```
-
-### Examine the timing and memory usage
-You will first need to run the following set up in your model in order to
-compute the memory and timing statistics.
-
-```python
-# Generate the meta information for the model that contains the memory usage
-# and timing information.
-run_metadata = tf.RunMetadata()
-with tf.Session() as sess:
-  _ = sess.run(train_op,
-               options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
-               run_metadata=run_metadata)
-```
-
-Finally, you may run `print_model_analysis` to explore the timing and memory
-demands of the model.
-
-``` python
-# Print to stdout an analysis of the memory usage and the timing information
-# from running the graph broken down by operations.
-tf.contrib.tfprof.model_analyzer.print_model_analysis(
-    tf.get_default_graph(),
-    run_meta=run_metadata,
-    tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
-```
-
-Users can change ```tfprof_options``` to fully leverage tfprof's power.
+    *  stdout: Results are written to stdout.
+    *  timeline: Visualized in chrome browser as time series.
+    *  file: Results are dumped to file.
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
index 13b407d815..17dff69edd 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
@@ -45,7 +45,7 @@ TRAINABLE_VARS_PARAMS_STAT_OPTIONS = {
     'hide_name_regexes': [],
     'account_displayed_op_only': True,
     'select': ['params'],
-    'viz': False,
+    'output': 'stdout',
     'dump_to_file': ''
 }
 
@@ -65,7 +65,7 @@ FLOAT_OPS_OPTIONS = {
     'hide_name_regexes': [],
     'account_displayed_op_only': True,
     'select': ['float_ops'],
-    'viz': False,
+    'output': 'stdout',
     'dump_to_file': ''
 }
 
@@ -87,7 +87,7 @@ PRINT_PARAMS_ON_DEVICE = {
     'hide_name_regexes': [],
     'account_displayed_op_only': False,
     'select': ['device', 'params'],
-    'viz': False,
+    'output': 'stdout',
     'dump_to_file': ''
 }
 
@@ -107,7 +107,7 @@ PRINT_ALL_TIMING_MEMORY = {
     'hide_name_regexes': [],
     'account_displayed_op_only': True,
     'select': ['micros', 'bytes'],
-    'viz': False,
+    'output': 'stdout',
     'dump_to_file': ''
 }
 
@@ -178,7 +178,7 @@ def print_model_analysis(graph,
   opts.account_displayed_op_only = tfprof_options['account_displayed_op_only']
   for p in tfprof_options['select']:
     opts.select.append(p)
-  opts.viz = tfprof_options['viz']
+  opts.output = tfprof_options['output']
   opts.dump_to_file = tfprof_options['dump_to_file']
 
   run_meta_str = run_meta.SerializeToString() if run_meta else b''
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
index 55167576e2..afd8563e78 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
@@ -35,13 +35,14 @@ class PrintModelAnalysisTest(test.TestCase):
   def testDumpToFile(self):
     ops.reset_default_graph()
     opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
-    opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts['output'] = 'file:outfile=' + outfile
 
     with session.Session() as sess, ops.device('/cpu:0'):
       _ = lib.BuildSmallModel()
       model_analyzer.print_model_analysis(sess.graph, tfprof_options=opts)
 
-      with gfile.Open(opts['dump_to_file'], 'r') as f:
+      with gfile.Open(outfile, 'r') as f:
         self.assertEqual(u'_TFProfRoot (--/451 params)\n'
                          '  DW (3x3x3x6, 162/162 params)\n'
                          '  DW2 (2x2x6x12, 288/288 params)\n'
@@ -51,15 +52,14 @@ class PrintModelAnalysisTest(test.TestCase):
   def testSelectEverything(self):
     ops.reset_default_graph()
     opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
-    opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts['output'] = 'file:outfile=' + outfile
     opts['account_type_regexes'] = ['.*']
     opts['select'] = [
         'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device', 'op_types'
     ]
 
-    config = config_pb2.ConfigProto(
-        graph_options=config_pb2.GraphOptions(build_cost_model=1))
-    with session.Session(config=config) as sess, ops.device('/cpu:0'):
+    with session.Session() as sess, ops.device('/cpu:0'):
       x = lib.BuildSmallModel()
 
       sess.run(variables.global_variables_initializer())
@@ -72,17 +72,18 @@ class PrintModelAnalysisTest(test.TestCase):
       model_analyzer.print_model_analysis(
           sess.graph, run_meta, tfprof_options=opts)
 
-      with gfile.Open(opts['dump_to_file'], 'r') as f:
+      with gfile.Open(outfile, 'r') as f:
         # pylint: disable=line-too-long
         self.assertEqual(
-            '_TFProfRoot (0/451 params, 0/10.44k flops, 0B/5.28KB, _kTFScopeParent)\n  Conv2D (0/0 params, 5.83k/5.83k flops, 432B/432B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n  Conv2D_1 (0/0 params, 4.61k/4.61k flops, 384B/384B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n  DW (3x3x3x6, 162/162 params, 0/0 flops, 648B/1.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables)\n    DW/Assign (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Assign)\n    DW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      DW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        DW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        DW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        DW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    DW/read (0/0 params, 0/0 flops, 648B/648B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n  DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.15KB/2.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables)\n    DW2/Assign (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Assign)\n    DW2/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      DW2/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        DW2/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        DW2/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW2/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        DW2/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW2/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    DW2/read (0/0 params, 0/0 flops, 1.15KB/1.15KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|VariableV2|_trainable_variables)\n    ScalarW/Assign (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Assign)\n    ScalarW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      ScalarW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        ScalarW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        ScalarW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        ScalarW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        ScalarW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        ScalarW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    ScalarW/read (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Identity)\n  init (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|NoOp)\n  zeros (0/0 params, 0/0 flops, 864B/864B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Const)\n',
+            '_TFProfRoot (0/451 params, 0/10.44k flops, 0B/5.28KB, _kTFScopeParent)\n  Conv2D (0/0 params, 5.83k/5.83k flops, 432B/432B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n  Conv2D_1 (0/0 params, 4.61k/4.61k flops, 384B/384B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n  DW (3x3x3x6, 162/162 params, 0/0 flops, 648B/1.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables)\n    DW/Assign (0/0 params, 0/0 flops, 0B/0B, Assign)\n    DW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      DW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        DW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        DW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        DW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    DW/read (0/0 params, 0/0 flops, 648B/648B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n  DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.15KB/2.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables)\n    DW2/Assign (0/0 params, 0/0 flops, 0B/0B, Assign)\n    DW2/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      DW2/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        DW2/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        DW2/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW2/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        DW2/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW2/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    DW2/read (0/0 params, 0/0 flops, 1.15KB/1.15KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, VariableV2|_trainable_variables)\n    ScalarW/Assign (0/0 params, 0/0 flops, 0B/0B, Assign)\n    ScalarW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      ScalarW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        ScalarW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        ScalarW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        ScalarW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        ScalarW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        ScalarW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    ScalarW/read (0/0 params, 0/0 flops, 0B/0B, Identity)\n  init (0/0 params, 0/0 flops, 0B/0B, NoOp)\n  zeros (0/0 params, 0/0 flops, 864B/864B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Const)\n',
             f.read())
         # pylint: enable=line-too-long
 
   def testSimpleCodeView(self):
     ops.reset_default_graph()
     opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
-    opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts['output'] = 'file:outfile=' + outfile
     opts['account_type_regexes'] = ['.*']
     opts['show_name_regexes'] = ['.*model_analyzer_testlib.*']
     opts['account_displayed_op_only'] = False
@@ -92,9 +93,7 @@ class PrintModelAnalysisTest(test.TestCase):
         'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device',
     ]
 
-    config = config_pb2.ConfigProto(
-        graph_options=config_pb2.GraphOptions(build_cost_model=1))
-    with session.Session(config=config) as sess, ops.device('/cpu:0'):
+    with session.Session() as sess, ops.device('/cpu:0'):
       x = lib.BuildSmallModel()
 
       sess.run(variables.global_variables_initializer())
@@ -107,7 +106,7 @@ class PrintModelAnalysisTest(test.TestCase):
       model_analyzer.print_model_analysis(
           sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
 
-      with gfile.Open(opts['dump_to_file'], 'r') as f:
+      with gfile.Open(outfile, 'r') as f:
         # pylint: disable=line-too-long
         self.assertEqual(
             '_TFProfRoot (0/451 params, 0/10.44k flops, 0B/5.28KB)\n  model_analyzer_testlib.py:33:BuildSmallModel:image = array_ops... (0/0 params, 0/0 flops, 0B/864B)\n  model_analyzer_testlib.py:37:BuildSmallModel:initializer=init_... (0/1 params, 0/0 flops, 0B/0B)\n  model_analyzer_testlib.py:41:BuildSmallModel:initializer=init_... (0/162 params, 0/0 flops, 0B/1.30KB)\n  model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/5.83k flops, 0B/432B)\n  model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0/288 params, 0/0 flops, 0B/2.30KB)\n  model_analyzer_testlib.py:47:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/4.61k flops, 0B/384B)\n',
@@ -117,15 +116,14 @@ class PrintModelAnalysisTest(test.TestCase):
   def testComplexCodeView(self):
     ops.reset_default_graph()
     opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
-    opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts['output'] = 'file:outfile=' + outfile
     opts['account_type_regexes'] = ['.*']
     opts['show_name_regexes'] = ['.*model_analyzer_testlib.py.*']
     opts['account_displayed_op_only'] = False
     opts['select'] = ['params', 'float_ops']
 
-    config = config_pb2.ConfigProto(
-        graph_options=config_pb2.GraphOptions(build_cost_model=1))
-    with session.Session(config=config) as sess, ops.device('/cpu:0'):
+    with session.Session() as sess, ops.device('/cpu:0'):
       x = lib.BuildFullModel()
 
       sess.run(variables.global_variables_initializer())
@@ -139,7 +137,7 @@ class PrintModelAnalysisTest(test.TestCase):
           sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
 
       # pylint: disable=line-too-long
-      with gfile.Open(opts['dump_to_file'], 'r') as f:
+      with gfile.Open(outfile, 'r') as f:
         self.assertEqual(
             '_TFProfRoot (0/2.84k params, 0/54.08k flops)\n  model_analyzer_testlib.py:56:BuildFullModel:seq.append(array_... (0/1.80k params, 0/41.76k flops)\n    model_analyzer_testlib.py:33:BuildSmallModel:image = array_ops... (0/0 params, 0/0 flops)\n    model_analyzer_testlib.py:37:BuildSmallModel:initializer=init_... (0/4 params, 0/0 flops)\n    model_analyzer_testlib.py:41:BuildSmallModel:initializer=init_... (0/648 params, 0/0 flops)\n    model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/23.33k flops)\n    model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0/1.15k params, 0/0 flops)\n    model_analyzer_testlib.py:47:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/18.43k flops)\n  model_analyzer_testlib.py:60:BuildFullModel:cell, array_ops.c... (0/1.04k params, 0/4.13k flops)\n  model_analyzer_testlib.py:62:BuildFullModel:target = array_op... (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:63:BuildFullModel:loss = nn_ops.l2_... (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:65:BuildFullModel:return sgd_op.min... (0/0 params, 0/8.19k flops)\n',
             f.read())
@@ -170,9 +168,7 @@ class PrintModelAnalysisTest(test.TestCase):
         'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device'
     ]
 
-    config = config_pb2.ConfigProto(
-        graph_options=config_pb2.GraphOptions(build_cost_model=1))
-    with session.Session(config=config) as sess, ops.device('/cpu:0'):
+    with session.Session() as sess, ops.device('/cpu:0'):
       x = lib.BuildSmallModel()
 
       sess.run(variables.global_variables_initializer())
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py
index aa133d3142..c3e9fc9cc0 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py
@@ -51,7 +51,7 @@ TEST_OPTIONS = {
     'hide_name_regexes': [],
     'account_displayed_op_only': True,
     'select': ['params'],
-    'viz': False
+    'output': 'stdout',
 }
 
 # pylint: enable=bad-whitespace
@@ -92,7 +92,7 @@ class PrintModelAnalysisTest(test.TestCase):
     opts.account_displayed_op_only = TEST_OPTIONS['account_displayed_op_only']
     for p in TEST_OPTIONS['select']:
       opts.select.append(p)
-    opts.viz = TEST_OPTIONS['viz']
+    opts.output = TEST_OPTIONS['output']
 
     with session.Session() as sess, ops.device('/cpu:0'):
       _ = self._BuildSmallModel()
@@ -116,7 +116,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 0
-      device: "/device:CPU:0"
       float_ops: 0
       total_float_ops: 0
       }
@@ -128,7 +127,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 648
-      device: "/device:CPU:0"
       children {
       name: "DW/Assign"
       exec_micros: 0
@@ -136,7 +134,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 0
-      device: "/device:CPU:0"
       float_ops: 0
       total_float_ops: 0
       }
@@ -217,7 +214,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 0
-      device: "/device:CPU:0"
       float_ops: 0
       total_float_ops: 0
       }
@@ -231,7 +227,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 0
-      device: "/device:CPU:0"
       float_ops: 0
       total_float_ops: 0
       }
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py b/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
index cd3912bbfb..e6d504d516 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
@@ -62,7 +62,7 @@ def _fill_missing_graph_shape(graph, run_meta):
   return graph
 
 
-def _get_logged_ops(graph, run_meta=None, add_trace=False):
+def _get_logged_ops(graph, run_meta=None, add_trace=True):
   """Extract trainable model parameters and FLOPs for ops from a Graph.
 
   Args:
@@ -120,9 +120,8 @@ def _get_logged_ops(graph, run_meta=None, add_trace=False):
   return logged_ops
 
 
-def _merge_default_with_oplog(graph, op_log=None,
-                              run_meta=None,
-                              add_trace=False):
+def _merge_default_with_oplog(graph, op_log=None, run_meta=None,
+                              add_trace=True):
   """Merge the tfprof default extra info with caller's op_log.
 
   Args:
@@ -154,8 +153,7 @@ def _merge_default_with_oplog(graph, op_log=None,
   return tmp_op_log
 
 
-def write_op_log(graph, log_dir, op_log=None, run_meta=None,
-                 add_trace=False):
+def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
   """Log provided 'op_log', and add additional model information below.
 
     The API also assigns ops in tf.trainable_variables() an op type called
diff --git a/tensorflow/tools/tfprof/README.md b/tensorflow/tools/tfprof/README.md
index 52d376d5f5..69f09411a9 100644
--- a/tensorflow/tools/tfprof/README.md
+++ b/tensorflow/tools/tfprof/README.md
@@ -1,6 +1,6 @@
 # tfprof: A Profiling Tool for TensorFlow Models
 
-Author: Xin Pan (xpan@google.com, github: panyx0718)
+Author: Xin Pan (xpan@google.com, github: panyx0718), Jon Shlens, Yao Zhang
 
 Consultants: Jon Shlens, Pete Warden
 
@@ -8,15 +8,22 @@ Consultants: Jon Shlens, Pete Warden
 ###Major Features
 
 1.  Measure model parameters, float operations, tensor shapes.
-2.  Measure op execution times, requested memory size and device placement.
+2.  Profile op execution times, requested memory size and device placement.
 3.  Inspect checkpoint tensors' shapes and their values.
-4.  3 ways to view and explore TensorFlow model profiles
+4.  Selectively group, filter, account and order ops.
 
-    *  Organize by Python code call stack.
-    *  Organize by TensorFlow operation name scope hierarchies.
-    *  Organize by TensorFlow operation inputs/outputs graph.
+####tfprof supports 3 views to organize TensorFlow model profiles
+
+    *  code view: Stats are associated your Python codes and organized as call stacks.
+    *  scope view: Stats are organized as name scope hierarchies.
+    *  graph view: Stats are organized as Tensorflow Op graph.
+
+####For each view, there are 3 ways to display outputs:
+
+    *  stdout: Results are written to stdout.
+    *  timeline: Visualized in chrome browser as time series.
+    *  file: Results are dumped to file.
 
-5.  Selectively grouping/filtering/accounting/ordering ops.
 
 [Python API Tutorials](#python-api-tutorials): It can be called directly from
 Python codes. Results are either printed
@@ -83,13 +90,11 @@ compute the memory and timing statistics.
 #
 # Note: When run on GPU, a kernel is first scheduled (enqueued) and then
 #       executed asynchronously. tfprof only tracks the execution time.
-#       Which is from proto CostGraphDef::Node::compute_cost.
 #       In addition, a substantial of time might be spent between Python and
 #       TensorFlow runtime, which is also not tracked by tfprof.
 #
-config = tf.ConfigProto(graph_options=tf.GraphOptions(build_cost_model=1))
 run_metadata = tf.RunMetadata()
-with tf.Session(config=config) as sess:
+with tf.Session() as sess:
   _ = sess.run(train_op,
                options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                run_metadata=run_metadata)
@@ -121,6 +126,18 @@ tf.contrib.tfprof.model_analyzer.print_model_analysis(
 
 Users can change ```tfprof_options``` to fully leverage tfprof's power.
 
+```
+For example set opts['output'] = 'timeline:outfile=<filename>' to
+generate a timeline json file. Open a Chrome Browser, open URL
+chrome://tracing, and load the json file. Below are 2 examples of graph
+view and scope view. See code view example in later examples.
+```
+
+<left>
+![CodeTimeline](g3doc/graph_timeline.png)
+![CodeTimeline](g3doc/scope_timeline.png)
+</left>
+
 
 ## CLI Tutorials
 
@@ -197,8 +214,14 @@ tfprof>
 # supported select fileds. Availability depends on --[run_meta|checkpoint|op_log]_path.
 # [bytes|micros|params|float_ops|num_hidden_ops|tensor_value|device|op_types]
 -select                     params
--viz                        false
--dump_to_file
+# format: output_type:key=value,key=value...
+# output_types: stdout (default), timeline, file.
+# key=value pairs:
+#   1. timeline: outfile=<filename>
+#   2. file: outfile=<filename>
+#   3. stdout: None.
+# E.g. timeline:outfile=/tmp/timeline.json
+-output
 ```
 
 3) I want to see which line of my python codes costs most time!
@@ -222,6 +245,12 @@ _TFProfRoot (0us/22.44ms)
         model_analyzer_test.py:134:testComplexCodeView:sess.run(variable... (0us/0us)
 ```
 
+Set ```-output timeline:outfile=<filename>``` to generate timeline instead of stdout.
+<left>
+![CodeTimeline](g3doc/code_timeline.png)
+</left>
+
+
 4) I want to see the `BatchNorm`'s gamma value in checkpoint.
 
 ```shell
@@ -423,10 +452,10 @@ the tool adds all `Variables` inside `tf.trainable_variables()` to
 12) Run tfprof in one-shot mode and dump result to file.
 
 ```shell
-# Printed to stdout if --dump_to_file is not set.
+# By default output to stdout. Use -output option to change output types.
 tfprof scope --graph_path=graph.pbtxt  \
              --max_depth=3 \
-             --dump_to_file="/tmp/dump"
+             --output="file:outfile=/tmp/dump"
 Reading Files...
 Parsing GraphDef...
 Preparing Views...
@@ -538,4 +567,9 @@ as long as they match the `-account_xxx` options.
 
 `-select`: Comma-separated list of metrics to show: [bytes|micros|params|float_ops|num_hidden_ops|tensor_value|device|op_types].
 
-`-dump_to_file`: Dump the output to a file, instead of terminal.
+`-output`: Output results as stdout, file or timeline.
+The format is ```output_type:key=value,key=value```.
+For example: ```timeline:outfile=<filename>```.
+timeline: key=outfile, value=<filename>.
+stdout: none.
+file: key=outfile, value=<filename>.
diff --git a/tensorflow/tools/tfprof/g3doc/code_timeline.png b/tensorflow/tools/tfprof/g3doc/code_timeline.png
new file mode 100644
index 0000000000..c5ab246f7d
--- /dev/null
+++ b/tensorflow/tools/tfprof/g3doc/code_timeline.png
diff --git a/tensorflow/tools/tfprof/g3doc/graph_timeline.png b/tensorflow/tools/tfprof/g3doc/graph_timeline.png
new file mode 100644
index 0000000000..255a91fd5f
--- /dev/null
+++ b/tensorflow/tools/tfprof/g3doc/graph_timeline.png
diff --git a/tensorflow/tools/tfprof/g3doc/scope_timeline.png b/tensorflow/tools/tfprof/g3doc/scope_timeline.png
new file mode 100644
index 0000000000..c6d95af84a
--- /dev/null
+++ b/tensorflow/tools/tfprof/g3doc/scope_timeline.png
diff --git a/tensorflow/tools/tfprof/internal/BUILD b/tensorflow/tools/tfprof/internal/BUILD
index adace89985..e90f0ec40a 100644
--- a/tensorflow/tools/tfprof/internal/BUILD
+++ b/tensorflow/tools/tfprof/internal/BUILD
@@ -21,6 +21,7 @@ cc_library(
         ":tfprof_options",
         ":tfprof_scope",
         ":tfprof_show",
+        ":tfprof_timeline",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:lib",
@@ -31,6 +32,20 @@ cc_library(
 )
 
 cc_library(
+    name = "tfprof_timeline",
+    srcs = ["tfprof_timeline.cc"],
+    hdrs = ["tfprof_timeline.h"],
+    deps = [
+        ":tfprof_node_show",
+        ":tfprof_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+        "@jsoncpp_git//:jsoncpp",
+    ],
+)
+
+cc_library(
     name = "tfprof_node",
     srcs = ["tfprof_node.cc"],
     hdrs = ["tfprof_node.h"],
@@ -71,7 +86,7 @@ cc_library(
         ":tfprof_node",
         ":tfprof_options",
         ":tfprof_show_code",
-        ":tfprof_tensor",
+        ":tfprof_timeline",
         ":tfprof_utils",
         "//tensorflow/c:c_api",
         "//tensorflow/c:checkpoint_reader",
@@ -93,6 +108,7 @@ cc_library(
         ":tfprof_options",
         ":tfprof_show",
         ":tfprof_tensor",
+        ":tfprof_timeline",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:lib",
@@ -103,14 +119,31 @@ cc_library(
 )
 
 cc_library(
+    name = "tfprof_node_show",
+    srcs = ["tfprof_node_show.cc"],
+    hdrs = ["tfprof_node_show.h"],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_node",
+        ":tfprof_options",
+        ":tfprof_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+    ],
+)
+
+cc_library(
     name = "tfprof_show",
     srcs = ["tfprof_show.cc"],
     hdrs = ["tfprof_show.h"],
     deps = [
         ":tfprof_constants",
         ":tfprof_node",
+        ":tfprof_node_show",
         ":tfprof_options",
         ":tfprof_tensor",
+        ":tfprof_timeline",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:lib",
@@ -127,10 +160,12 @@ cc_library(
     deps = [
         ":tfprof_constants",
         ":tfprof_node",
+        ":tfprof_node_show",
         ":tfprof_options",
         ":tfprof_scope",
         ":tfprof_show",
         ":tfprof_tensor",
+        ":tfprof_timeline",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:lib",
@@ -166,6 +201,27 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "tfprof_timeline_test",
+    srcs = ["tfprof_timeline_test.cc"],
+    data = [
+        "testdata/graph.pbtxt",
+        "testdata/run_meta",
+    ],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_options",
+        ":tfprof_stats",
+        ":tfprof_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "tfprof_utils",
     srcs = ["tfprof_utils.cc"],
diff --git a/tensorflow/tools/tfprof/internal/print_model_analysis.cc b/tensorflow/tools/tfprof/internal/print_model_analysis.cc
index c816e3209e..f73675e8a7 100644
--- a/tensorflow/tools/tfprof/internal/print_model_analysis.cc
+++ b/tensorflow/tools/tfprof/internal/print_model_analysis.cc
@@ -56,11 +56,14 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
   TFStats tf_stats(std::move(graph_ptr), std::move(run_meta_ptr),
                    std::move(op_log_ptr), std::move(ckpt_reader));
 
-  Options opts = Options::FromProtoStr(*options);
+  Options opts;
+  tensorflow::Status s = Options::FromProtoStr(*options, &opts);
+  if (!s.ok()) {
+    fprintf(stderr, "%s\n", s.ToString().c_str());
+    return "";
+  }
 
-  // TODO(xpan): We should have dump_to_file/print_stdout/etc to control
-  // side-effects independently instead of one controlling the other.
-  if (opts.dump_to_file.empty()) {
+  if (opts.output_type == kOutput[1]) {
     printf("\n=========================Options=============================\n");
     printf("%s", opts.ToString().c_str());
     printf("\n==================Model Analysis Report======================\n");
diff --git a/tensorflow/tools/tfprof/internal/tfprof_code.cc b/tensorflow/tools/tfprof/internal/tfprof_code.cc
index 5618a52286..9739db1e0b 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_code.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_code.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
-#include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -99,7 +98,8 @@ CodeNode* TFCode::BuildCodeNodes(TFCodeNode* root) {
   return code_root_ptr;
 }
 
-const ShowCodeNode* TFCode::ShowInternal(const Options& opts) {
+const ShowCodeNode* TFCode::ShowInternal(const Options& opts,
+                                         Timeline* timeline) {
   // Search from roots recursively to find start node, if start_name_regexes
   // is specified.
   tfprof_trace_root_.reset(new TFCodeNode(kTFProfRoot));
@@ -117,7 +117,11 @@ const ShowCodeNode* TFCode::ShowInternal(const Options& opts) {
   tfprof_code_root_->children.assign(roots.begin(), roots.end());
   Account({tfprof_code_root_.get()}, opts);
 
-  return PrintScope({tfprof_code_root_.get()}, opts, 1, 0)[0];
+  CodeNode* root = PrintScope({tfprof_code_root_.get()}, opts, 1, 0)[0];
+  if (timeline) {
+    timeline->GenerateCodeTimeline(root);
+  }
+  return root;
 }
 
 std::vector<CodeNode*> TFCode::SearchRoot(std::vector<CodeNode*> roots,
@@ -170,8 +174,13 @@ std::vector<CodeNode*> TFCode::PrintScope(const std::vector<CodeNode*> roots,
       show_cnodes = SortNodes(show_cnodes, opts);
       string children_str;
       for (CodeNode* sc : show_cnodes) {
-        children_str += sc->formatted_str;
+        if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
+          children_str += sc->formatted_str;
+          sc->formatted_str.clear();
+        }
         node->mutable_proto()->add_children()->MergeFrom(sc->proto());
+        sc->mutable_proto()->mutable_children()->Clear();
+        node->show_children.push_back(sc);
         if (opts.account_displayed_op_only) {
           node->AggregateTotalStats(sc);
         }
diff --git a/tensorflow/tools/tfprof/internal/tfprof_code.h b/tensorflow/tools/tfprof/internal/tfprof_code.h
index 7c517e6b0e..d7a28624f1 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_code.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_code.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/tools/tfprof/internal/tfprof_node.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_show_code.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
 #include "tensorflow/tools/tfprof/tfprof_log.pb.h"
 #include "tensorflow/tools/tfprof/tfprof_output.pb.h"
@@ -37,22 +38,6 @@ limitations under the License.
 namespace tensorflow {
 namespace tfprof {
 
-class CodeNode : public ShowCodeNode {
- public:
-  explicit CodeNode(const TFCodeNode* node) : ShowCodeNode(node) {}
-  ~CodeNode() override {}
-
-  void AggregateTotalStats(CodeNode* node) {
-    ShowCodeNode::AggregateTotalStats(node);
-  }
-
-  void AddSelfToTotalStats() { ShowCodeNode::AddSelfToTotalStats(); }
-
-  void ResetTotalStats() { ShowCodeNode::ResetTotalStats(); }
-
-  std::vector<CodeNode*> children;
-};
-
 class TFCode : public TFShowCode {
  public:
   explicit TFCode() : code_root_(nullptr), trace_root_(nullptr) {}
@@ -65,7 +50,8 @@ class TFCode : public TFShowCode {
  private:
   CodeNode* BuildCodeNodes(TFCodeNode* root);
 
-  const ShowCodeNode* ShowInternal(const Options& opts) override;
+  const ShowCodeNode* ShowInternal(const Options& opts,
+                                   Timeline* timeline) override;
 
   std::vector<CodeNode*> SearchRoot(std::vector<CodeNode*> roots,
                                     const std::vector<string>& regexes);
diff --git a/tensorflow/tools/tfprof/internal/tfprof_graph.cc b/tensorflow/tools/tfprof/internal/tfprof_graph.cc
index 1623d9f8c4..23084146c2 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_graph.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_graph.cc
@@ -66,7 +66,7 @@ void TFGraph::Build() {
   }
 }
 
-const ShowNode* TFGraph::ShowInternal(const Options& opts) {
+const ShowNode* TFGraph::ShowInternal(const Options& opts, Timeline* timeline) {
   // Search the nodes to start from.
   std::vector<GraphNode*> roots = roots_;
   if (opts.start_name_regexes.size() != 1 ||
@@ -81,11 +81,13 @@ const ShowNode* TFGraph::ShowInternal(const Options& opts) {
   std::map<string, int64> account_visits;
   Account({root}, opts, &account_visits);
 
-  if (opts.viz) {
-    printf("Visualizing feature disabled...\n");
-  }
   std::set<string> visits;
-  return PrintGraph({root}, opts, 1, 0, 0, &visits)[0];
+  root = PrintGraph({root}, opts, 1, 0, 0, &visits)[0];
+
+  if (timeline) {
+    timeline->GenerateGraphTimeline(root);
+  }
+  return root;
 }
 
 std::vector<GraphNode*> TFGraph::SearchRoot(
@@ -155,8 +157,14 @@ std::vector<GraphNode*> TFGraph::PrintGraph(const std::vector<GraphNode*> roots,
       show_cnodes = SortNodes(show_cnodes, opts);
       string children_str;
       for (GraphNode* sc : show_cnodes) {
-        children_str += sc->formatted_str;
-        node->mutable_proto()->add_children()->MergeFrom(sc->proto());
+        if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
+          children_str += sc->formatted_str;
+          sc->formatted_str.clear();
+        }
+        // This swap and reinit pattern is critical for performance.
+        node->mutable_proto()->add_children()->Swap(sc->mutable_proto());
+        sc->ReInit();
+        node->show_children.push_back(sc);
         if (opts.account_displayed_op_only) {
           node->AggregateTotalStats(sc);
         }
@@ -181,7 +189,6 @@ std::vector<GraphNode*> TFGraph::PrintGraph(const std::vector<GraphNode*> roots,
           node->formatted_str += value_str;
         }
       }
-
       node->formatted_str += children_str;
       show_nodes.push_back(node);
     } else {
diff --git a/tensorflow/tools/tfprof/internal/tfprof_graph.h b/tensorflow/tools/tfprof/internal/tfprof_graph.h
index 75979d020c..4d4aa8b2b1 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_graph.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_graph.h
@@ -37,32 +37,6 @@ limitations under the License.
 
 namespace tensorflow {
 namespace tfprof {
-class GraphNode : public ShowNode {
- public:
-  explicit GraphNode(TFGraphNode* node) : ShowNode(node) {
-    mutable_proto()->set_inputs(node->inputs().size());
-    mutable_proto()->set_total_inputs(0);
-  }
-
-  void AggregateTotalStats(GraphNode* node) {
-    ShowNode::AggregateTotalStats(node);
-    mutable_proto()->set_total_inputs(proto().total_inputs() +
-                                      node->proto().total_inputs() + 1);
-  }
-
-  void AddSelfToTotalStats() {
-    ShowNode::AddSelfToTotalStats();
-    mutable_proto()->set_total_inputs(proto().total_inputs() +
-                                      proto().inputs());
-  }
-
-  void ResetTotalStats() {
-    ShowNode::ResetTotalStats();
-    mutable_proto()->set_total_inputs(0);
-  }
-
-  std::vector<GraphNode*> children;
-};
 
 // Organize tensorflow ops in a graph structure, pointing from output ops
 // to input ops.
@@ -77,7 +51,8 @@ class TFGraph : public TFShow {
   void Build() override;
 
  private:
-  const ShowNode* ShowInternal(const Options& opts) override;
+  const ShowNode* ShowInternal(const Options& opts,
+                               Timeline* timeline) override;
 
   bool ShouldShowIfExtra(ShowNode* node, const Options& opts,
                          int depth) override {
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node.cc b/tensorflow/tools/tfprof/internal/tfprof_node.cc
index 5f018addb4..74c8fcbe48 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_node.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_node.cc
@@ -20,20 +20,22 @@ limitations under the License.
 
 namespace tensorflow {
 namespace tfprof {
+// Notes about start and end time from the NodeExecStats proto.
+// For GPU, there is no difference between op_end_rel_micros and
+// all_end_rel_micros. All are kernel times.
+// For CPU, op_end_rel is the kernel time, while all_end_rel_micros includes
+// some post-processing.
+// Here, we only consider kernel time for simplicity.
 void TFGraphNode::AddStepStat(const string& device,
                               const NodeExecStats* step_stat) {
-  if (!device.empty()) {
-    // This might override device from GraphDef.
-    device_ = device;
-  }
   step_stat_ = step_stat;
+  CHECK(step_stat_);
 
-  op_start_micros_ = step_stat_->all_start_micros();
-  if (step_stat_->op_end_rel_micros() && step_stat_->op_start_rel_micros()) {
-    op_schedule_micros_ =
-        step_stat_->op_end_rel_micros() - step_stat_->op_start_rel_micros();
-  }
-  all_spent_micros_ = step_stat_->all_end_rel_micros();
+  string dev = str_util::Lowercase(device);
+
+  devices_.insert(dev);
+  op_kernel_execs_[dev].push_back(std::make_pair(
+      step_stat_->all_start_micros(), step_stat_->op_end_rel_micros()));
 
   for (const auto& output : step_stat_->output()) {
     if (output.has_tensor_description() &&
@@ -44,9 +46,5 @@ void TFGraphNode::AddStepStat(const string& device,
     }
   }
 }
-
-void TFGraphNode::AddNodeStat(const CostGraphDef::Node* cost_node) {
-  kernel_compute_micros_ = cost_node->compute_cost();
-}
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node.h b/tensorflow/tools/tfprof/internal/tfprof_node.h
index 235904ea6c..8e57db7ba2 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_node.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_node.h
@@ -23,12 +23,12 @@ limitations under the License.
 
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
 #include "tensorflow/tools/tfprof/tfprof_log.pb.h"
 
@@ -41,10 +41,6 @@ class TFGraphNode {
       : node_(node),
         code_(nullptr),
         step_stat_(nullptr),
-        op_start_micros_(0),
-        op_schedule_micros_(0),
-        kernel_compute_micros_(0),
-        all_spent_micros_(0),
         requested_bytes_(0),
         float_ops_(0) {
     if (!node) return;
@@ -69,38 +65,51 @@ class TFGraphNode {
       update_shape(shape_vec);
     }
     op_types_.insert(node->op());
-    device_ = node->device();
   }
 
   TFGraphNode() : TFGraphNode(nullptr) {}
 
-  void AddInput(TFGraphNode* input) { inputs_[input->name()] = input; }
+  void AddInput(TFGraphNode* input, int64 output_idx) {
+    inputs_[input->name()] = input;
+    output_idx_[input->name()] = output_idx;
+  }
 
   void AddOpType(const string& op_type) { op_types_.insert(op_type); }
 
   void AddStepStat(const string& device, const NodeExecStats* step_stat);
 
-  // Add CostGraphDef::Node.
-  void AddNodeStat(const CostGraphDef::Node* cost_node);
-
   void AddFloatOps(int64 float_ops) { float_ops_ = float_ops; }
 
   void AddCode(const CodeDef* code) { code_ = code; }
 
   const string& name() const { return node_->name(); }
   const NodeDef* node_def() { return node_; }
+
+  const NodeExecStats* step_stats() const { return step_stat_; }
+
   const std::map<string, TFGraphNode*>& inputs() const { return inputs_; }
-  int64 op_start_micros() { return op_start_micros_; }
-  // This is time spent in Op::Compute(), which is GPU kernel schedule time.
-  // Currently not used.
-  int64 op_schedule_micros() { return op_schedule_micros_; }
+  const std::map<string, int64>& output_idx() { return output_idx_; }
+
   // This is time spent in kernel execution.
-  int64 kernel_compute_micros() const { return kernel_compute_micros_; }
-  int64 all_spent_micros() { return all_spent_micros_; }
+  int64 kernel_exec_micros() const {
+    if (!step_stat_) return 0;
+    int64 total = 0;
+    for (const auto& execs : op_kernel_execs_) {
+      for (const auto& exec : execs.second) {
+        total += exec.second;
+      }
+    }
+    return total;
+  }
+  const std::map<string, std::vector<std::pair<int64, int64>>>&
+  op_kernel_execs() const {
+    return op_kernel_execs_;
+  }
+
   int64 requested_bytes() const { return requested_bytes_; }
   int64 float_ops() const { return float_ops_; }
   const CodeDef* code() { return code_; }
-  string device() const { return device_; }
+  std::set<string> devices() const { return devices_; }
   const std::set<string>& op_types() const { return op_types_; }
 
   const std::vector<int64>& shape() const { return shape_; }
@@ -109,17 +118,19 @@ class TFGraphNode {
   void update_shape(const std::vector<int64>& shape) { shape_ = shape; }
 
   std::map<string, TFGraphNode*> inputs_;
+  std::map<string, int64> output_idx_;
+
   const NodeDef* node_;
   const CodeDef* code_;
   const NodeExecStats* step_stat_;
 
   std::vector<int64> shape_;
   std::set<string> op_types_;
-  string device_;
-  int64 op_start_micros_;
-  int64 op_schedule_micros_;
-  int64 kernel_compute_micros_;
-  int64 all_spent_micros_;
+
+  // device -> vector of {op_start_micros, op_kernel_exec_micros} pairs.
+  std::map<string, std::vector<std::pair<int64, int64>>> op_kernel_execs_;
+
+  std::set<string> devices_;
   int64 requested_bytes_;
   int64 float_ops_;
 };
@@ -128,7 +139,7 @@ class TFCodeNode {
  public:
   TFCodeNode(const string& trace)
       : trace_(trace),
-        kernel_compute_micros_(0),
+        kernel_exec_micros_(0),
         requested_bytes_(0),
         float_ops_(0) {}
 
@@ -138,16 +149,15 @@ class TFCodeNode {
     }
     nodes_[node->name()] = node;
 
-    kernel_compute_micros_ += node->kernel_compute_micros();
+    kernel_exec_micros_ += node->kernel_exec_micros();
     requested_bytes_ += node->requested_bytes();
     float_ops_ += node->float_ops();
     op_types_.insert(node->op_types().begin(), node->op_types().end());
     if (node->shape().size() > 0) {
       shapes_.push_back(node->shape());
     }
-    if (!node->device().empty()) {
-      devices_.insert(node->device());
-    }
+    std::set<string> devices = node->devices();
+    devices_.insert(devices.begin(), devices.end());
   }
   const std::map<string, const TFGraphNode*>& graph_nodes() const {
     return nodes_;
@@ -165,7 +175,7 @@ class TFCodeNode {
 
   const string& name() const { return trace_; }
 
-  int64 kernel_compute_micros() const { return kernel_compute_micros_; }
+  int64 kernel_exec_micros() const { return kernel_exec_micros_; }
 
   int64 requested_bytes() const { return requested_bytes_; }
 
@@ -180,7 +190,7 @@ class TFCodeNode {
  private:
   const string trace_;
   std::set<string> op_types_;
-  int64 kernel_compute_micros_;
+  int64 kernel_exec_micros_;
   int64 requested_bytes_;
   int64 float_ops_;
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node_show.cc b/tensorflow/tools/tfprof/internal/tfprof_node_show.cc
new file mode 100644
index 0000000000..2b5390676d
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_node_show.cc
@@ -0,0 +1,296 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
+
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace tensorflow {
+namespace tfprof {
+ShowNode::ShowNode(const TFGraphNode* node) : node(node), account(true) {
+  ReInit();
+}
+
+void ShowNode::ReInit() {
+  mutable_proto()->set_name(name());
+  for (const string& device : node->devices()) {
+    *mutable_proto()->mutable_devices()->Add() = device;
+  }
+  mutable_proto()->set_exec_micros(node->kernel_exec_micros());
+  mutable_proto()->set_requested_bytes(node->requested_bytes());
+  mutable_proto()->set_float_ops(node->float_ops());
+
+  if (!node->shape().empty()) {
+    int64 params = 1;
+    bool complete_shape = true;
+    for (int64 d : node->shape()) {
+      // Sometimes parameters could be <0 when a dim is unknown.
+      if (d < 0) {
+        complete_shape = false;
+        break;
+      }
+      params *= d;
+    }
+    if (complete_shape) {
+      mutable_proto()->set_parameters(proto_.parameters() + params);
+    } else {
+      fprintf(stderr, "Incomplete shape.");
+    }
+  }
+}
+
+string ShowNode::Format(const Options& opts) {
+  if (opts.select.empty()) {
+    return name();
+  }
+  return strings::Printf("%s (%s)", name().c_str(), FormatMeta(opts).c_str());
+}
+
+string ShowNode::FormatMeta(const Options& opts) {
+  std::vector<string> info;
+  if (opts.select.find(kShown[2]) != opts.select.end()) {
+    const string shape = FormatShapes(node->shape());
+    if (!shape.empty()) {
+      info.push_back(shape);
+    }
+    string params = FormatNumber(proto().total_parameters()) + " params";
+    if (account) {
+      params = FormatNumber(proto().parameters()) + "/" + params;
+    } else {
+      params = "--/" + params;
+    }
+    info.push_back(params);
+  }
+  if (opts.select.find(kShown[3]) != opts.select.end()) {
+    string fops = FormatNumber(proto().total_float_ops()) + " flops";
+    if (account) {
+      fops = FormatNumber(proto().float_ops()) + "/" + fops;
+    } else {
+      fops = "--/" + fops;
+    }
+    info.push_back(fops);
+  }
+  if (opts.select.find(kShown[0]) != opts.select.end()) {
+    string memory = FormatMemory(proto().total_requested_bytes());
+    if (account) {
+      memory = FormatMemory(proto().requested_bytes()) + "/" + memory;
+
+    } else {
+      memory = "--/" + memory;
+    }
+    info.push_back(memory);
+  }
+  if (opts.select.find(kShown[1]) != opts.select.end()) {
+    string time = FormatTime(proto().total_exec_micros());
+    if (account) {
+      time = FormatTime(proto().exec_micros()) + "/" + time;
+    } else {
+      time = "--/" + time;
+    }
+    info.push_back(time);
+  }
+  if (opts.select.find(kShown[6]) != opts.select.end()) {
+    if (proto().devices_size() > 0) {
+      info.push_back(str_util::Join(proto().devices(), "|"));
+    }
+  }
+  if (opts.select.find(kShown[7]) != opts.select.end()) {
+    std::set<string> op_types = node->op_types();
+    // Device is considered a type.
+    if (proto().devices_size() > 0) {
+      op_types.insert(str_util::Join(proto().devices(), "|"));
+    }
+    info.push_back(str_util::Join(op_types, "|"));
+  }
+  return str_util::Join(info, ", ");
+}
+
+TFGraphNodeProto* ShowNode::mutable_proto() { return &proto_; }
+
+const TFGraphNodeProto& ShowNode::proto() const { return proto_; }
+
+void ShowNode::AggregateTotalStats(ShowNode* node) {
+  TFGraphNodeProto* node_pb = node->mutable_proto();
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         node_pb->total_exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             node_pb->total_requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        node_pb->total_parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       node_pb->total_float_ops());
+}
+
+void ShowNode::AddSelfToTotalStats() {
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         proto().exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             proto().requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        proto().parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       proto().float_ops());
+}
+
+void ShowNode::ResetTotalStats() {
+  mutable_proto()->set_total_exec_micros(0);
+  mutable_proto()->set_total_requested_bytes(0);
+  mutable_proto()->set_total_parameters(0);
+  mutable_proto()->set_total_float_ops(0);
+  mutable_proto()->mutable_children()->Clear();
+}
+
+ShowCodeNode::ShowCodeNode(const TFCodeNode* node) : node(node), account(true) {
+  std::vector<ScopeNode> snodes;
+  for (auto it : node->graph_nodes()) {
+    ScopeNode snode(it.second);
+    snodes.push_back(snode);
+    snodes[snodes.size() - 1].AddSelfToTotalStats();
+    *mutable_proto()->mutable_graph_nodes()->Add() =
+        snodes[snodes.size() - 1].proto();
+  }
+
+  mutable_proto()->set_name(name());
+  mutable_proto()->set_exec_micros(node->kernel_exec_micros());
+  mutable_proto()->set_requested_bytes(node->requested_bytes());
+  mutable_proto()->set_float_ops(node->float_ops());
+
+  if (!node->shapes().empty()) {
+    for (const std::vector<int64>& shape : node->shapes()) {
+      int64 params = 1;
+      bool complete_shape = true;
+      for (int64 d : shape) {
+        // Sometimes parameters could be <0 when a dim is unknown.
+        if (d < 0) {
+          complete_shape = false;
+          break;
+        }
+        params *= d;
+      }
+      if (complete_shape) {
+        mutable_proto()->set_parameters(proto().parameters() + params);
+      } else {
+        fprintf(stderr, "Incomplete shape.");
+      }
+    }
+  }
+}
+
+string ShowCodeNode::Format(const Options& opts) {
+  if (opts.select.empty()) {
+    return name();
+  }
+  return strings::Printf("%s (%s)", name().c_str(), FormatMeta(opts).c_str());
+}
+
+string ShowCodeNode::FormatMeta(const Options& opts) {
+  std::vector<string> info;
+  std::vector<string> shapes;
+  if (opts.select.find(kShown[2]) != opts.select.end()) {
+    for (const std::vector<int64>& shape : node->shapes()) {
+      if (!shape.empty()) {
+        shapes.push_back(FormatShapes(shape));
+      }
+    }
+    if (!shapes.empty()) {
+      info.push_back(str_util::Join(shapes, "|"));
+    }
+    string params = FormatNumber(proto().total_parameters()) + " params";
+    if (account) {
+      params = FormatNumber(proto().parameters()) + "/" + params;
+    } else {
+      params = "--/" + params;
+    }
+    info.push_back(params);
+  }
+  if (opts.select.find(kShown[3]) != opts.select.end()) {
+    string fops = FormatNumber(proto().total_float_ops()) + " flops";
+    if (account) {
+      fops = FormatNumber(proto().float_ops()) + "/" + fops;
+    } else {
+      fops = "--/" + fops;
+    }
+    info.push_back(fops);
+  }
+  if (opts.select.find(kShown[0]) != opts.select.end()) {
+    string memory = FormatMemory(proto().total_requested_bytes());
+    if (account) {
+      memory = FormatMemory(proto().requested_bytes()) + "/" + memory;
+
+    } else {
+      memory = "--/" + memory;
+    }
+    info.push_back(memory);
+  }
+  if (opts.select.find(kShown[1]) != opts.select.end()) {
+    string time = FormatTime(proto().total_exec_micros());
+    if (account) {
+      time = FormatTime(proto().exec_micros()) + "/" + time;
+    } else {
+      time = "--/" + time;
+    }
+    info.push_back(time);
+  }
+  if (opts.select.find(kShown[6]) != opts.select.end()) {
+    if (!node->devices().empty()) {
+      info.push_back(str_util::Join(node->devices(), "|"));
+    }
+  }
+  if (opts.select.find(kShown[7]) != opts.select.end()) {
+    std::set<string> op_types = node->op_types();
+    // Device is considered a type.
+    op_types.insert(node->devices().cbegin(), node->devices().cend());
+    info.push_back(str_util::Join(op_types, "|"));
+  }
+  return str_util::Join(info, ", ");
+}
+
+TFCodeNodeProto* ShowCodeNode::mutable_proto() { return &proto_; }
+
+const TFCodeNodeProto& ShowCodeNode::proto() const { return proto_; }
+
+void ShowCodeNode::AggregateTotalStats(ShowCodeNode* node) {
+  TFCodeNodeProto* node_pb = node->mutable_proto();
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         node_pb->total_exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             node_pb->total_requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        node_pb->total_parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       node_pb->total_float_ops());
+}
+
+void ShowCodeNode::AddSelfToTotalStats() {
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         proto().exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             proto().requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        proto().parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       proto().float_ops());
+}
+
+void ShowCodeNode::ResetTotalStats() {
+  mutable_proto()->set_total_exec_micros(0);
+  mutable_proto()->set_total_requested_bytes(0);
+  mutable_proto()->set_total_parameters(0);
+  mutable_proto()->set_total_float_ops(0);
+  mutable_proto()->mutable_children()->Clear();
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node_show.h b/tensorflow/tools/tfprof/internal/tfprof_node_show.h
new file mode 100644
index 0000000000..4ce0f63f9b
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_node_show.h
@@ -0,0 +1,173 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Nodes used for different views.
+// ScopeNode is for scope view. GraphNode is for graph view and CodeNode
+// is for code view.
+
+#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_SHOW_H_
+#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_SHOW_H_
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+class ShowNode {
+ public:
+  explicit ShowNode(const TFGraphNode* node);
+  virtual ~ShowNode() {}
+
+  const string& name() const { return node->name(); }
+  TFGraphNodeProto* mutable_proto();
+  const TFGraphNodeProto& proto() const;
+
+  void ReInit();
+
+  string Format(const Options& opts);
+
+  string FormatMeta(const Options& opts);
+
+  const TFGraphNode* node;
+  bool account;
+  string formatted_str;
+
+ protected:
+  void AggregateTotalStats(ShowNode* node);
+
+  void AddSelfToTotalStats();
+
+  void ResetTotalStats();
+
+  TFGraphNodeProto proto_;
+};
+
+class GraphNode : public ShowNode {
+ public:
+  explicit GraphNode(TFGraphNode* node) : ShowNode(node) {
+    mutable_proto()->set_inputs(node->inputs().size());
+    mutable_proto()->set_total_inputs(0);
+  }
+
+  void ReInit() {
+    ShowNode::ReInit();
+    mutable_proto()->set_inputs(node->inputs().size());
+    mutable_proto()->set_total_inputs(0);
+  }
+
+  void AggregateTotalStats(GraphNode* node) {
+    ShowNode::AggregateTotalStats(node);
+    mutable_proto()->set_total_inputs(proto().total_inputs() +
+                                      node->proto().total_inputs() + 1);
+  }
+
+  void AddSelfToTotalStats() {
+    ShowNode::AddSelfToTotalStats();
+    mutable_proto()->set_total_inputs(proto().total_inputs() +
+                                      proto().inputs());
+  }
+
+  void ResetTotalStats() {
+    ShowNode::ResetTotalStats();
+    mutable_proto()->set_total_inputs(0);
+    show_children.clear();
+  }
+
+  std::vector<GraphNode*> children;
+  std::vector<GraphNode*> show_children;
+};
+
+class ScopeNode : public ShowNode {
+ public:
+  explicit ScopeNode(const TFGraphNode* node) : ShowNode(node) {}
+  ~ScopeNode() override {}
+
+  void ReInit() { ShowNode::ReInit(); }
+
+  void AggregateTotalStats(ScopeNode* node) {
+    ShowNode::AggregateTotalStats(node);
+  }
+
+  void AddSelfToTotalStats() { ShowNode::AddSelfToTotalStats(); }
+
+  void ResetTotalStats() {
+    ShowNode::ResetTotalStats();
+    show_children.clear();
+  }
+
+  std::vector<ScopeNode*> children;
+  std::vector<ScopeNode*> show_children;
+};
+
+class ShowCodeNode {
+ public:
+  explicit ShowCodeNode(const TFCodeNode* node);
+  virtual ~ShowCodeNode() {}
+
+  const string& name() const { return node->name(); }
+  TFCodeNodeProto* mutable_proto();
+  const TFCodeNodeProto& proto() const;
+
+  string Format(const Options& opts);
+
+  string FormatMeta(const Options& opts);
+
+  const TFCodeNode* node;
+  bool account;
+  string formatted_str;
+
+ protected:
+  void AggregateTotalStats(ShowCodeNode* node);
+
+  void AddSelfToTotalStats();
+
+  void ResetTotalStats();
+
+  TFCodeNodeProto proto_;
+};
+
+class CodeNode : public ShowCodeNode {
+ public:
+  explicit CodeNode(const TFCodeNode* node) : ShowCodeNode(node) {}
+  ~CodeNode() override {}
+
+  void AggregateTotalStats(CodeNode* node) {
+    ShowCodeNode::AggregateTotalStats(node);
+  }
+
+  void AddSelfToTotalStats() { ShowCodeNode::AddSelfToTotalStats(); }
+
+  void ResetTotalStats() {
+    ShowCodeNode::ResetTotalStats();
+    show_children.clear();
+  }
+
+  std::vector<CodeNode*> children;
+  std::vector<CodeNode*> show_children;
+};
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_SHOW_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_options.cc b/tensorflow/tools/tfprof/internal/tfprof_options.cc
index 03282533ff..f592a4cf8c 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_options.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_options.cc
@@ -17,16 +17,133 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/tools/tfprof/tfprof_options.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
+namespace {
+string KeyValueToStr(const std::map<string, string>& kv_map) {
+  std::vector<string> kv_vec;
+  kv_vec.reserve(kv_map.size());
+  for (const auto& pair : kv_map) {
+    kv_vec.push_back(strings::StrCat(pair.first, "=", pair.second));
+  }
+  return str_util::Join(kv_vec, ",");
+}
+}  // namespace
+
+tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
+                               std::map<string, string>* output_options) {
+  // The default is to use stdout.
+  if (output_opt.empty()) {
+    *output_type = kOutput[1];
+    return tensorflow::Status::OK();
+  }
+
+  std::set<string> output_types(kOutput,
+                                kOutput + sizeof(kOutput) / sizeof(*kOutput));
+  auto opt_split = output_opt.find(":");
+  std::vector<string> kv_split;
+  if (opt_split == output_opt.npos) {
+    if (output_types.find(output_opt) == output_types.end()) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          strings::Printf("E.g. Unknown output type: %s, Valid types: %s\n",
+                          output_opt.c_str(),
+                          str_util::Join(output_types, ",").c_str()));
+    }
+    *output_type = output_opt;
+  } else {
+    *output_type = output_opt.substr(0, opt_split);
+    if (output_types.find(*output_type) == output_types.end()) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          strings::Printf("E.g. Unknown output type: %s, Valid types: %s\n",
+                          output_type->c_str(),
+                          str_util::Join(output_types, ",").c_str()));
+    }
+    kv_split = str_util::Split(output_opt.substr(opt_split + 1), ",",
+                               str_util::SkipEmpty());
+  }
 
-Options Options::FromProtoStr(const string& opts_proto_str) {
+  std::set<string> valid_options;
+  std::set<string> required_options;
+  if (*output_type == kOutput[0]) {
+    valid_options.insert(
+        kTimelineOpts,
+        kTimelineOpts + sizeof(kTimelineOpts) / sizeof(*kTimelineOpts));
+    required_options.insert(
+        kTimelineRequiredOpts,
+        kTimelineRequiredOpts +
+            sizeof(kTimelineRequiredOpts) / sizeof(*kTimelineRequiredOpts));
+  } else if (*output_type == kOutput[2]) {
+    valid_options.insert(kFileOpts,
+                         kFileOpts + sizeof(kFileOpts) / sizeof(*kFileOpts));
+    required_options.insert(kFileRequiredOpts,
+                            kFileRequiredOpts + sizeof(kFileRequiredOpts) /
+                                                    sizeof(*kFileRequiredOpts));
+  }
+
+  for (const string& kv_str : kv_split) {
+    const std::vector<string> kv =
+        str_util::Split(kv_str, "=", str_util::SkipEmpty());
+    if (kv.size() != 2) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          "Visualize format: -output timeline:key=value,key=value,...");
+    }
+    if (valid_options.find(kv[0]) == valid_options.end()) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          strings::Printf("Unrecognized options %s for output_type: %s\n",
+                          kv[0].c_str(), output_type->c_str()));
+    }
+    (*output_options)[kv[0]] = kv[1];
+  }
+
+  for (const string& opt : required_options) {
+    if (output_options->find(opt) == output_options->end()) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          strings::Printf("Missing required output_options for %s\n"
+                          "E.g. -output %s:%s=...\n",
+                          output_type->c_str(), output_type->c_str(),
+                          opt.c_str()));
+    }
+  }
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status Options::FromProtoStr(const string& opts_proto_str,
+                                         Options* opts) {
   OptionsProto opts_pb;
-  CHECK(opts_pb.ParseFromString(opts_proto_str));
-  Options opts(
+  if (!opts_pb.ParseFromString(opts_proto_str)) {
+    return tensorflow::Status(
+        tensorflow::error::INTERNAL,
+        strings::StrCat("Failed to parse option string from Python API: ",
+                        opts_proto_str));
+  }
+
+  string output_type;
+  std::map<string, string> output_options;
+  tensorflow::Status s =
+      ParseOutput(opts_pb.output(), &output_type, &output_options);
+  if (!s.ok()) return s;
+
+  if (!opts_pb.dump_to_file().empty()) {
+    fprintf(stderr,
+            "-dump_to_file option is deprecated. "
+            "Please use -output file:outfile=<filename>\n");
+    fprintf(stderr, "-output %s is overwritten with -output file:outfile=%s\n",
+            opts_pb.output().c_str(), opts_pb.dump_to_file().c_str());
+    output_type = kOutput[2];
+    output_options.clear();
+    output_options[kFileOpts[0]] = opts_pb.dump_to_file();
+  }
+
+  *opts = Options(
       opts_pb.max_depth(), opts_pb.min_bytes(), opts_pb.min_micros(),
       opts_pb.min_params(), opts_pb.min_float_ops(),
       std::vector<string>(opts_pb.device_regexes().begin(),
@@ -44,8 +161,8 @@ Options Options::FromProtoStr(const string& opts_proto_str) {
                           opts_pb.hide_name_regexes().end()),
       opts_pb.account_displayed_op_only(),
       std::vector<string>(opts_pb.select().begin(), opts_pb.select().end()),
-      opts_pb.viz(), opts_pb.dump_to_file());
-  return opts;
+      output_type, output_options);
+  return tensorflow::Status::OK();
 }
 
 string Options::ToString() const {
@@ -64,8 +181,7 @@ string Options::ToString() const {
       "%-28s%s\n"
       "%-28s%s\n"
       "%-28s%s\n"
-      "%-28s%s\n"
-      "%-28s%s\n",
+      "%-28s%s:%s\n",
       kOptions[0], max_depth, kOptions[1], min_bytes, kOptions[2], min_micros,
       kOptions[3], min_params, kOptions[4], min_float_ops, kOptions[5],
       str_util::Join(device_regexes, ",").c_str(), kOptions[6],
@@ -76,8 +192,8 @@ string Options::ToString() const {
       str_util::Join(show_name_regexes, ",").c_str(), kOptions[11],
       str_util::Join(hide_name_regexes, ",").c_str(), kOptions[12],
       (account_displayed_op_only ? "true" : "false"), kOptions[13],
-      str_util::Join(select, ",").c_str(), kOptions[14],
-      (viz ? "true" : "false"), kOptions[15], dump_to_file.c_str());
+      str_util::Join(select, ",").c_str(), kOptions[14], output_type.c_str(),
+      KeyValueToStr(output_options).c_str());
   return s;
 }
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_options.h b/tensorflow/tools/tfprof/internal/tfprof_options.h
index 0a9f2768e0..cf48b4de81 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_options.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_options.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -40,8 +41,7 @@ static const char* const kOptions[] = {
     "-hide_name_regexes",
     "-account_displayed_op_only",
     "-select",
-    "-viz",
-    "-dump_to_file",
+    "-output",
 };
 
 static const char* const kOrderBy[] = {
@@ -58,11 +58,30 @@ static const char* const kCmds[] = {
     "scope", "graph", "code", "set", "help",
 };
 
+static const char* const kOutput[] = {"timeline", "stdout", "file"};
+
+static const char* const kTimelineOpts[] = {
+    "outfile",
+};
+
+static const char* const kTimelineRequiredOpts[] = {"outfile"};
+
+static const char* const kFileOpts[] = {
+    "outfile",
+};
+
+static const char* const kFileRequiredOpts[] = {
+    "outfile",
+};
+
 struct Options {
  public:
-  static Options FromProtoStr(const string& opts_proto_str);
+  static tensorflow::Status FromProtoStr(const string& opts_proto_str,
+                                         Options* opts);
 
   virtual ~Options() {}
+  Options()
+      : Options(0, 0, 0, 0, 0, {}, "", {}, {}, {}, {}, {}, false, {}, "", {}) {}
   Options(int max_depth, tensorflow::int64 min_bytes,
           tensorflow::int64 min_micros, tensorflow::int64 min_params,
           tensorflow::int64 min_float_ops,
@@ -73,7 +92,8 @@ struct Options {
           const std::vector<string>& show_name_regexes,
           const std::vector<string>& hide_name_regexes,
           bool account_displayed_op_only, const std::vector<string>& select,
-          bool viz, const string& dump_to_file = "")
+          const string& output_type,
+          const std::map<string, string>& output_options)
       : max_depth(max_depth),
         min_bytes(min_bytes),
         min_micros(min_micros),
@@ -88,8 +108,8 @@ struct Options {
         hide_name_regexes(hide_name_regexes),
         account_displayed_op_only(account_displayed_op_only),
         select(select.begin(), select.end()),
-        viz(viz),
-        dump_to_file(dump_to_file) {}
+        output_type(output_type),
+        output_options(output_options) {}
 
   string ToString() const;
 
@@ -109,10 +129,17 @@ struct Options {
   bool account_displayed_op_only;
 
   std::set<string> select;
-  bool viz;
-  string dump_to_file;
+
+  string output_type;
+  std::map<string, string> output_options;
 };
 
+// Parse the -output option.
+// 'output_opt': User input string with format: output_type:key=value,key=value.
+// 'output_type' and 'output_options' are extracted from 'output_opt'.
+tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
+                               std::map<string, string>* output_options);
+
 }  // namespace tfprof
 }  // namespace tensorflow
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_scope.cc b/tensorflow/tools/tfprof/internal/tfprof_scope.cc
index b4aef717c8..fe525c4bd8 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_scope.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_scope.cc
@@ -72,7 +72,7 @@ void TFScope::Build() {
   }
 }
 
-const ShowNode* TFScope::ShowInternal(const Options& opts) {
+const ShowNode* TFScope::ShowInternal(const Options& opts, Timeline* timeline) {
   // Search from roots recursively to find start node, if start_name_regexes
   // is specified.
   std::vector<ScopeNode*> roots = roots_;
@@ -86,6 +86,9 @@ const ShowNode* TFScope::ShowInternal(const Options& opts) {
   Account({root}, opts);
 
   root = PrintScope({root}, opts, 1, 0)[0];
+  if (timeline) {
+    timeline->GenerateScopeTimeline(root);
+  }
   return root;
 }
 
@@ -139,8 +142,13 @@ std::vector<ScopeNode*> TFScope::PrintScope(const std::vector<ScopeNode*> roots,
       show_cnodes = SortNodes(show_cnodes, opts);
       string children_str;
       for (ScopeNode* sc : show_cnodes) {
-        children_str += sc->formatted_str;
+        if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
+          children_str += sc->formatted_str;
+          sc->formatted_str.clear();
+        }
         node->mutable_proto()->add_children()->MergeFrom(sc->proto());
+        sc->mutable_proto()->mutable_children()->Clear();
+        node->show_children.push_back(sc);
         if (opts.account_displayed_op_only) {
           node->AggregateTotalStats(sc);
         }
diff --git a/tensorflow/tools/tfprof/internal/tfprof_scope.h b/tensorflow/tools/tfprof/internal/tfprof_scope.h
index 2e2e4f5266..7bdcc794cd 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_scope.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_scope.h
@@ -37,22 +37,6 @@ limitations under the License.
 namespace tensorflow {
 namespace tfprof {
 
-class ScopeNode : public ShowNode {
- public:
-  explicit ScopeNode(const TFGraphNode* node) : ShowNode(node) {}
-  ~ScopeNode() override {}
-
-  void AggregateTotalStats(ScopeNode* node) {
-    ShowNode::AggregateTotalStats(node);
-  }
-
-  void AddSelfToTotalStats() { ShowNode::AddSelfToTotalStats(); }
-
-  void ResetTotalStats() { ShowNode::ResetTotalStats(); }
-
-  std::vector<ScopeNode*> children;
-};
-
 class TFScope : public TFShow {
  public:
   explicit TFScope(checkpoint::CheckpointReader* ckpt_reader)
@@ -64,7 +48,8 @@ class TFScope : public TFShow {
   void Build() override;
 
  private:
-  const ShowNode* ShowInternal(const Options& opts) override;
+  const ShowNode* ShowInternal(const Options& opts,
+                               Timeline* timeline) override;
 
   ScopeNode* CreateParentNode(const string& name);
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show.cc b/tensorflow/tools/tfprof/internal/tfprof_show.cc
index 932dfb3893..b96db5468e 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_show.cc
@@ -18,155 +18,32 @@ limitations under the License.
 #include <memory>
 #include <set>
 
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/regexp.h"
 
 namespace tensorflow {
 namespace tfprof {
-ShowNode::ShowNode(const TFGraphNode* node) : node(node), account(true) {
-  mutable_proto()->set_name(name());
-  if (!node->device().empty()) {
-    mutable_proto()->set_device(node->device());
-  }
-  mutable_proto()->set_exec_micros(node->kernel_compute_micros());
-  mutable_proto()->set_requested_bytes(node->requested_bytes());
-  mutable_proto()->set_float_ops(node->float_ops());
-
-  if (!node->shape().empty()) {
-    int64 params = 1;
-    bool complete_shape = true;
-    for (int64 d : node->shape()) {
-      // Sometimes parameters could be <0 when a dim is unknown.
-      if (d < 0) {
-        complete_shape = false;
-        break;
-      }
-      params *= d;
-    }
-    if (complete_shape) {
-      mutable_proto()->set_parameters(proto_.parameters() + params);
-    } else {
-      fprintf(stderr, "Incomplete shape.");
-    }
-  }
-}
-
-string ShowNode::Format(const Options& opts) {
-  if (opts.select.empty()) {
-    return name();
-  }
-  return strings::Printf("%s (%s)", name().c_str(), FormatMeta(opts).c_str());
-}
-
-string ShowNode::FormatMeta(const Options& opts) {
-  std::vector<string> info;
-  if (opts.select.find(kShown[2]) != opts.select.end()) {
-    const string shape = FormatShapes(node->shape());
-    if (!shape.empty()) {
-      info.push_back(shape);
-    }
-    string params = FormatNumber(proto().total_parameters()) + " params";
-    if (account) {
-      params = FormatNumber(proto().parameters()) + "/" + params;
-    } else {
-      params = "--/" + params;
-    }
-    info.push_back(params);
-  }
-  if (opts.select.find(kShown[3]) != opts.select.end()) {
-    string fops = FormatNumber(proto().total_float_ops()) + " flops";
-    if (account) {
-      fops = FormatNumber(proto().float_ops()) + "/" + fops;
-    } else {
-      fops = "--/" + fops;
-    }
-    info.push_back(fops);
-  }
-  if (opts.select.find(kShown[0]) != opts.select.end()) {
-    string memory = FormatMemory(proto().total_requested_bytes());
-    if (account) {
-      memory = FormatMemory(proto().requested_bytes()) + "/" + memory;
-
-    } else {
-      memory = "--/" + memory;
-    }
-    info.push_back(memory);
-  }
-  if (opts.select.find(kShown[1]) != opts.select.end()) {
-    string time = FormatTime(proto().total_exec_micros());
-    if (account) {
-      time = FormatTime(proto().exec_micros()) + "/" + time;
-    } else {
-      time = "--/" + time;
-    }
-    info.push_back(time);
-  }
-  if (opts.select.find(kShown[6]) != opts.select.end()) {
-    if (!proto().device().empty()) {
-      info.push_back(proto().device());
-    }
-  }
-  if (opts.select.find(kShown[7]) != opts.select.end()) {
-    std::set<string> op_types = node->op_types();
-    // Device is considered a type.
-    if (!proto().device().empty()) {
-      op_types.insert(proto().device());
-    }
-    info.push_back(str_util::Join(op_types, "|"));
-  }
-  return str_util::Join(info, ", ");
-}
-
-TFGraphNodeProto* ShowNode::mutable_proto() { return &proto_; }
-
-const TFGraphNodeProto& ShowNode::proto() const { return proto_; }
-
-void ShowNode::AggregateTotalStats(ShowNode* node) {
-  TFGraphNodeProto* node_pb = node->mutable_proto();
-  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
-                                         node_pb->total_exec_micros());
-  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
-                                             node_pb->total_requested_bytes());
-  mutable_proto()->set_total_parameters(proto().total_parameters() +
-                                        node_pb->total_parameters());
-  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
-                                       node_pb->total_float_ops());
-}
-
-void ShowNode::AddSelfToTotalStats() {
-  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
-                                         proto().exec_micros());
-  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
-                                             proto().requested_bytes());
-  mutable_proto()->set_total_parameters(proto().total_parameters() +
-                                        proto().parameters());
-  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
-                                       proto().float_ops());
-}
-
-void ShowNode::ResetTotalStats() {
-  mutable_proto()->set_total_exec_micros(0);
-  mutable_proto()->set_total_requested_bytes(0);
-  mutable_proto()->set_total_parameters(0);
-  mutable_proto()->set_total_float_ops(0);
-  mutable_proto()->mutable_children()->Clear();
-}
 
 const TFGraphNodeProto& TFShow::Show(const Options& opts) {
-  const ShowNode* root = ShowInternal(opts);
-  if (opts.dump_to_file.empty()) {
-    printf("%s", root->formatted_str.c_str());
-    fflush(stdout);
-  } else {
-    Status s = WriteStringToFile(Env::Default(), opts.dump_to_file,
-                                 root->formatted_str);
+  if (opts.output_type == kOutput[0]) {
+    Timeline timeline(opts.output_options.at(kTimelineOpts[0]));
+    return ShowInternal(opts, &timeline)->proto();
+  } else if (opts.output_type == kOutput[2]) {
+    const ShowNode* root = ShowInternal(opts, nullptr);
+    Status s =
+        WriteStringToFile(Env::Default(), opts.output_options.at(kFileOpts[0]),
+                          root->formatted_str);
     if (!s.ok()) {
       fprintf(stderr, "%s\n", s.ToString().c_str());
     }
+    return root->proto();
+  } else {
+    const ShowNode* root = ShowInternal(opts, nullptr);
+    printf("%s", root->formatted_str.c_str());
+    fflush(stdout);
+    return root->proto();
   }
-  return root->proto();
 }
 
 bool TFShow::LookUpCheckPoint(const string& name,
@@ -206,10 +83,13 @@ bool TFShow::ShouldShow(ShowNode* node, const Options& opts, int depth) {
     show = true;
   } else {
     for (const string& regex : opts.device_regexes) {
-      if (RE2::FullMatch(node->proto().device(), regex)) {
-        show = true;
-        break;
+      for (const string& device : node->proto().devices()) {
+        if (RE2::FullMatch(device, regex)) {
+          show = true;
+          break;
+        }
       }
+      if (show) break;
     }
   }
   // Don't show if device_regexes don't cover it.
@@ -255,11 +135,11 @@ bool TFShow::ShouldAccount(ShowNode* node, const Options& opts) {
         return true;
       }
     }
-    if (RE2::FullMatch(node->proto().device(), regex)) {
-      return true;
-    }
+    for (const string& device : node->proto().devices())
+      if (RE2::FullMatch(device, regex)) {
+        return true;
+      }
   }
-
   return false;
 }
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show.h b/tensorflow/tools/tfprof/internal/tfprof_show.h
index 5e85b81a72..803b301044 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_show.h
@@ -28,40 +28,15 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
 #include "tensorflow/tools/tfprof/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
-class ShowNode {
- public:
-  explicit ShowNode(const TFGraphNode* node);
-  virtual ~ShowNode() {}
-
-  const string& name() const { return node->name(); }
-  TFGraphNodeProto* mutable_proto();
-  const TFGraphNodeProto& proto() const;
-
-  string Format(const Options& opts);
-
-  string FormatMeta(const Options& opts);
-
-  const TFGraphNode* node;
-  bool account;
-  string formatted_str;
-
- protected:
-  void AggregateTotalStats(ShowNode* node);
-
-  void AddSelfToTotalStats();
-
-  void ResetTotalStats();
-
-  TFGraphNodeProto proto_;
-};
-
 class TFShow {
  public:
   explicit TFShow(checkpoint::CheckpointReader* ckpt_reader)
@@ -72,7 +47,8 @@ class TFShow {
   const TFGraphNodeProto& Show(const Options& opts);
 
  protected:
-  virtual const ShowNode* ShowInternal(const Options& opts) = 0;
+  virtual const ShowNode* ShowInternal(const Options& opts,
+                                       Timeline* timeline) = 0;
 
   bool LookUpCheckPoint(const string& name,
                         std::unique_ptr<TFProfTensor>* tensor);
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_code.cc b/tensorflow/tools/tfprof/internal/tfprof_show_code.cc
index 17c0073c7e..cfec09ad19 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show_code.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_show_code.cc
@@ -26,159 +26,26 @@ limitations under the License.
 
 namespace tensorflow {
 namespace tfprof {
-ShowCodeNode::ShowCodeNode(const TFCodeNode* node) : node(node), account(true) {
-  std::vector<ScopeNode> snodes;
-  for (auto it : node->graph_nodes()) {
-    ScopeNode snode(it.second);
-    snodes.push_back(snode);
-    snodes[snodes.size() - 1].AddSelfToTotalStats();
-    *mutable_proto()->mutable_graph_nodes()->Add() =
-        snodes[snodes.size() - 1].proto();
-  }
-
-  mutable_proto()->set_name(name());
-  mutable_proto()->set_exec_micros(node->kernel_compute_micros());
-  mutable_proto()->set_requested_bytes(node->requested_bytes());
-  mutable_proto()->set_float_ops(node->float_ops());
-
-  if (!node->shapes().empty()) {
-    for (const std::vector<int64>& shape : node->shapes()) {
-      int64 params = 1;
-      bool complete_shape = true;
-      for (int64 d : shape) {
-        // Sometimes parameters could be <0 when a dim is unknown.
-        if (d < 0) {
-          complete_shape = false;
-          break;
-        }
-        params *= d;
-      }
-      if (complete_shape) {
-        mutable_proto()->set_parameters(proto().parameters() + params);
-      } else {
-        fprintf(stderr, "Incomplete shape.");
-      }
-    }
-  }
-}
-
-string ShowCodeNode::Format(const Options& opts) {
-  if (opts.select.empty()) {
-    return name();
-  }
-  return strings::Printf("%s (%s)", name().c_str(), FormatMeta(opts).c_str());
-}
-
-string ShowCodeNode::FormatMeta(const Options& opts) {
-  std::vector<string> info;
-  std::vector<string> shapes;
-  if (opts.select.find(kShown[2]) != opts.select.end()) {
-    for (const std::vector<int64>& shape : node->shapes()) {
-      if (!shape.empty()) {
-        shapes.push_back(FormatShapes(shape));
-      }
-    }
-    if (!shapes.empty()) {
-      info.push_back(str_util::Join(shapes, "|"));
-    }
-    string params = FormatNumber(proto().total_parameters()) + " params";
-    if (account) {
-      params = FormatNumber(proto().parameters()) + "/" + params;
-    } else {
-      params = "--/" + params;
-    }
-    info.push_back(params);
-  }
-  if (opts.select.find(kShown[3]) != opts.select.end()) {
-    string fops = FormatNumber(proto().total_float_ops()) + " flops";
-    if (account) {
-      fops = FormatNumber(proto().float_ops()) + "/" + fops;
-    } else {
-      fops = "--/" + fops;
-    }
-    info.push_back(fops);
-  }
-  if (opts.select.find(kShown[0]) != opts.select.end()) {
-    string memory = FormatMemory(proto().total_requested_bytes());
-    if (account) {
-      memory = FormatMemory(proto().requested_bytes()) + "/" + memory;
-
-    } else {
-      memory = "--/" + memory;
-    }
-    info.push_back(memory);
-  }
-  if (opts.select.find(kShown[1]) != opts.select.end()) {
-    string time = FormatTime(proto().total_exec_micros());
-    if (account) {
-      time = FormatTime(proto().exec_micros()) + "/" + time;
-    } else {
-      time = "--/" + time;
-    }
-    info.push_back(time);
-  }
-  if (opts.select.find(kShown[6]) != opts.select.end()) {
-    if (!node->devices().empty()) {
-      info.push_back(str_util::Join(node->devices(), "|"));
-    }
-  }
-  if (opts.select.find(kShown[7]) != opts.select.end()) {
-    std::set<string> op_types = node->op_types();
-    // Device is considered a type.
-    op_types.insert(node->devices().cbegin(), node->devices().cend());
-    info.push_back(str_util::Join(op_types, "|"));
-  }
-  return str_util::Join(info, ", ");
-}
-
-TFCodeNodeProto* ShowCodeNode::mutable_proto() { return &proto_; }
-
-const TFCodeNodeProto& ShowCodeNode::proto() const { return proto_; }
-
-void ShowCodeNode::AggregateTotalStats(ShowCodeNode* node) {
-  TFCodeNodeProto* node_pb = node->mutable_proto();
-  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
-                                         node_pb->total_exec_micros());
-  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
-                                             node_pb->total_requested_bytes());
-  mutable_proto()->set_total_parameters(proto().total_parameters() +
-                                        node_pb->total_parameters());
-  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
-                                       node_pb->total_float_ops());
-}
-
-void ShowCodeNode::AddSelfToTotalStats() {
-  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
-                                         proto().exec_micros());
-  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
-                                             proto().requested_bytes());
-  mutable_proto()->set_total_parameters(proto().total_parameters() +
-                                        proto().parameters());
-  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
-                                       proto().float_ops());
-}
-
-void ShowCodeNode::ResetTotalStats() {
-  mutable_proto()->set_total_exec_micros(0);
-  mutable_proto()->set_total_requested_bytes(0);
-  mutable_proto()->set_total_parameters(0);
-  mutable_proto()->set_total_float_ops(0);
-  mutable_proto()->mutable_children()->Clear();
-}
 
 const TFCodeNodeProto& TFShowCode::Show(const Options& opts) {
-  const ShowCodeNode* root = ShowInternal(opts);
-  if (opts.dump_to_file.empty()) {
-    printf("%s", root->formatted_str.c_str());
-    fflush(stdout);
-  } else {
-    Status s = WriteStringToFile(Env::Default(), opts.dump_to_file,
-                                 root->formatted_str);
+  if (opts.output_type == kOutput[0]) {
+    Timeline timeline(opts.output_options.at(kTimelineOpts[0]));
+    return ShowInternal(opts, &timeline)->proto();
+  } else if (opts.output_type == kOutput[2]) {
+    const ShowCodeNode* root = ShowInternal(opts, nullptr);
+    Status s =
+        WriteStringToFile(Env::Default(), opts.output_options.at(kFileOpts[0]),
+                          root->formatted_str);
     if (!s.ok()) {
       fprintf(stderr, "%s\n", s.ToString().c_str());
     }
+    return root->proto();
+  } else {
+    const ShowCodeNode* root = ShowInternal(opts, nullptr);
+    printf("%s", root->formatted_str.c_str());
+    fflush(stdout);
+    return root->proto();
   }
-  return root->proto();
 }
 
 bool TFShowCode::ShouldShow(ShowCodeNode* node, const Options& opts,
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_code.h b/tensorflow/tools/tfprof/internal/tfprof_show_code.h
index afe0fa4473..cbfd38945f 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show_code.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_show_code.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Parent class and utilities for tfprof_graph and tfprof_scope.
+// Parent class and utilities for tfprof_code.
 
 #ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_CODE_H_
 #define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_CODE_H_
@@ -28,39 +28,15 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
 #include "tensorflow/tools/tfprof/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
-class ShowCodeNode {
- public:
-  explicit ShowCodeNode(const TFCodeNode* node);
-  virtual ~ShowCodeNode() {}
-
-  const string& name() const { return node->name(); }
-  TFCodeNodeProto* mutable_proto();
-  const TFCodeNodeProto& proto() const;
-
-  string Format(const Options& opts);
-
-  string FormatMeta(const Options& opts);
-
-  const TFCodeNode* node;
-  bool account;
-  string formatted_str;
-
- protected:
-  void AggregateTotalStats(ShowCodeNode* node);
-
-  void AddSelfToTotalStats();
-
-  void ResetTotalStats();
-
-  TFCodeNodeProto proto_;
-};
 
 class TFShowCode {
  public:
@@ -71,7 +47,8 @@ class TFShowCode {
   const TFCodeNodeProto& Show(const Options& opts);
 
  protected:
-  virtual const ShowCodeNode* ShowInternal(const Options& opts) = 0;
+  virtual const ShowCodeNode* ShowInternal(const Options& opts,
+                                           Timeline* timeline) = 0;
 
   bool LookUpCheckPoint(const string& name,
                         std::unique_ptr<TFProfTensor>* tensor);
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_test.cc b/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
index ffaa576639..f0621c9af0 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
@@ -75,7 +75,7 @@ TEST_F(TFProfShowTest, DumpScopeMode) {
                {"VariableV2"},  // accout_type_regexes
                {".*"}, {""}, {".*"}, {""}, false,
                {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
-               false, dump_file);
+               "file", {{"outfile", dump_file}});
   tf_stats_->PrintGraph("scope", opts);
 
   string dump_str;
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats.cc b/tensorflow/tools/tfprof/internal/tfprof_stats.cc
index 13ff6e7246..566b4cee44 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_stats.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -92,10 +94,16 @@ void TFStats::ParseGraph() {
   for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
     const NodeDef* node_def = it->second.node_def();
     for (string node_input : node_def->input()) {
+      int output_idx = 0;
       // input name format can be: "^node:src_output"
       auto prefix_pos = node_input.find(":");
       if (prefix_pos != node_input.npos) {
-        node_input.substr(0, prefix_pos);
+        std::vector<string> input_parts = str_util::Split(node_input, ":");
+        CHECK(input_parts.size() == 2)
+            << "Unknown NodeDef.input format: " << node_input;
+        node_input = input_parts[0];
+        CHECK(strings::safe_strto32(input_parts[1], &output_idx))
+            << "Failed to parse integer: " << output_idx;
       }
       if (node_input.substr(0, 1) == "^") {
         node_input = node_input.substr(1);
@@ -104,7 +112,7 @@ void TFStats::ParseGraph() {
       if (input_node == nodes_map_.end()) {
         continue;
       }
-      it->second.AddInput(&input_node->second);
+      it->second.AddInput(&input_node->second, output_idx);
     }
   }
 }
@@ -137,21 +145,6 @@ void TFStats::ParseRunMeta() {
       node->second.AddStepStat(dev_stat.device(), &node_stat);
     }
   }
-
-  if (!run_meta_->has_cost_graph()) {
-    fprintf(stderr,
-            "Missing CostGraphDef in RunMetadata.\nMaybe you forget to"
-            "set tf.ConfigProto(graph_options=tf.GraphOptions("
-            "build_cost_model=1)) to Session()\n");
-  } else {
-    for (const auto& node_pb : run_meta_->cost_graph().node()) {
-      auto node = nodes_map_.find(node_pb.name());
-      if (node == nodes_map_.end()) {
-        continue;
-      }
-      node->second.AddNodeStat(&node_pb);
-    }
-  }
 }
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc b/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
index b913161f6a..eb01425e04 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
@@ -74,8 +74,8 @@ TEST_F(TFProfStatsTest, CustomOpType) {
   Options opts(3, 0, 0, 0, 0, {".*"}, "name",
                {kTrainableVarType},  // accout_type_regexes
                {".*"}, {""}, {".*"}, {""}, false,
-               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
-               false);
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, "",
+               {});
   const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
   TFGraphNodeProto expected;
@@ -84,20 +84,20 @@ TEST_F(TFProfStatsTest, CustomOpType) {
       "0\ntotal_exec_micros: 5\ntotal_requested_bytes: 1480\ntotal_parameters: "
       "370\nchildren {\n  name: \"conv2d/bias\"\n  exec_micros: 1\n  "
       "requested_bytes: 20\n  parameters: 5\n  total_exec_micros: 1\n  "
-      "total_requested_bytes: 20\n  total_parameters: 5\n  device: "
+      "total_requested_bytes: 20\n  total_parameters: 5\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 0\n  "
       "total_float_ops: 0\n}\nchildren {\n  name: \"conv2d/kernel\"\n  "
       "exec_micros: 1\n  requested_bytes: 540\n  parameters: 135\n  "
       "total_exec_micros: 1\n  total_requested_bytes: 540\n  total_parameters: "
-      "135\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "135\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nchildren {\n  name: \"conv2d_1/bias\"\n  "
       "exec_micros: 1\n  requested_bytes: 20\n  parameters: 5\n  "
       "total_exec_micros: 1\n  total_requested_bytes: 20\n  total_parameters: "
-      "5\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "5\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nchildren {\n  name: \"conv2d_1/kernel\"\n  "
       "exec_micros: 2\n  requested_bytes: 900\n  parameters: 225\n  "
       "total_exec_micros: 2\n  total_requested_bytes: 900\n  total_parameters: "
-      "225\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "225\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nfloat_ops: 0\ntotal_float_ops: 0\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
@@ -107,7 +107,7 @@ TEST_F(TFProfStatsTest, CheckPointOpType) {
   Options opts(
       3, 0, 0, 0, 0, {".*"}, "name", {kCkptVarType},  // accout_type_regexes
       {".*"}, {""}, {".*"}, {""}, false,
-      {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, false);
+      {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, "", {});
   const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
   TFGraphNodeProto expected;
@@ -116,20 +116,20 @@ TEST_F(TFProfStatsTest, CheckPointOpType) {
       "0\ntotal_exec_micros: 5\ntotal_requested_bytes: 1480\ntotal_parameters: "
       "370\nchildren {\n  name: \"conv2d/bias\"\n  exec_micros: 1\n  "
       "requested_bytes: 20\n  parameters: 5\n  total_exec_micros: 1\n  "
-      "total_requested_bytes: 20\n  total_parameters: 5\n  device: "
+      "total_requested_bytes: 20\n  total_parameters: 5\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 0\n  "
       "total_float_ops: 0\n}\nchildren {\n  name: \"conv2d/kernel\"\n  "
       "exec_micros: 1\n  requested_bytes: 540\n  parameters: 135\n  "
       "total_exec_micros: 1\n  total_requested_bytes: 540\n  total_parameters: "
-      "135\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "135\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nchildren {\n  name: \"conv2d_1/bias\"\n  "
       "exec_micros: 1\n  requested_bytes: 20\n  parameters: 5\n  "
       "total_exec_micros: 1\n  total_requested_bytes: 20\n  total_parameters: "
-      "5\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "5\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nchildren {\n  name: \"conv2d_1/kernel\"\n  "
       "exec_micros: 2\n  requested_bytes: 900\n  parameters: 225\n  "
       "total_exec_micros: 2\n  total_requested_bytes: 900\n  total_parameters: "
-      "225\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "225\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nfloat_ops: 0\ntotal_float_ops: 0\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
@@ -139,8 +139,8 @@ TEST_F(TFProfStatsTest, TestGraph) {
   Options opts(100, 0, 10000, 0, 0, {".*"}, "name", {".*"},
                {"cost.*"},  // start_name_regexes
                {""}, {".*"}, {""}, false,
-               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
-               false);
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, "",
+               {});
   const TFGraphNodeProto& root = tf_stats_->PrintGraph("graph", opts);
 
   TFGraphNodeProto expected;
@@ -154,7 +154,7 @@ TEST_F(TFProfStatsTest, TestGraph) {
 
 TEST_F(TFProfStatsTest, TestFloatOps) {
   Options opts(10, 0, 0, 0, 1, {".*"}, "name", {".*"}, {".*"}, {""}, {".*"},
-               {""}, false, {"float_ops"}, false);
+               {""}, false, {"float_ops"}, "", {});
   const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
   TFGraphNodeProto expected;
@@ -163,19 +163,19 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
       "0\ntotal_exec_micros: 96\ntotal_requested_bytes: "
       "8656\ntotal_parameters: 370\nchildren {\n  name: \"conv2d/BiasAdd\"\n  "
       "exec_micros: 12\n  requested_bytes: 1440\n  total_exec_micros: 12\n  "
-      "total_requested_bytes: 1440\n  total_parameters: 0\n  device: "
+      "total_requested_bytes: 1440\n  total_parameters: 0\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 360\n  "
       "total_float_ops: 360\n}\nchildren {\n  name: \"conv2d/convolution\"\n  "
       "exec_micros: 60\n  requested_bytes: 1440\n  total_exec_micros: 60\n  "
-      "total_requested_bytes: 1440\n  total_parameters: 0\n  device: "
+      "total_requested_bytes: 1440\n  total_parameters: 0\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 19440\n  "
       "total_float_ops: 19440\n}\nchildren {\n  name: \"conv2d_2/BiasAdd\"\n  "
       "exec_micros: 2\n  requested_bytes: 640\n  total_exec_micros: 2\n  "
-      "total_requested_bytes: 640\n  total_parameters: 0\n  device: "
+      "total_requested_bytes: 640\n  total_parameters: 0\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 160\n  "
       "total_float_ops: 160\n}\nchildren {\n  name: \"conv2d_2/convolution\"\n "
       " exec_micros: 13\n  requested_bytes: 640\n  total_exec_micros: 13\n  "
-      "total_requested_bytes: 640\n  total_parameters: 0\n  device: "
+      "total_requested_bytes: 640\n  total_parameters: 0\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 14400\n  "
       "total_float_ops: 14400\n}\nfloat_ops: 0\ntotal_float_ops: 34360\n",
       &expected));
@@ -186,7 +186,7 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
   Options opts(100, 0, 0, 0, 0, {".*"}, "name", {".*"}, {".*"}, {""},
                {"unit_2_1.*DW"},  // show_name_regexes.
                {""}, true,        // account_displayed_op_only.
-               {"params"}, false);
+               {"params"}, "", {});
   const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
   TFGraphNodeProto expected;
@@ -202,7 +202,7 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) {
   Options opts(10, 0, 0, 0, 0, {".*"}, "name", {".*"}, {".*"}, {""},
                {"unit_1_0.*gamma"}, {""}, false,
                {"tensor_value"},  // Show tensor value from checkpoint.
-               false);
+               "", {});
   const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
   TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
diff --git a/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc b/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
index 95759f9d47..79a781210d 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
@@ -57,7 +57,7 @@ class TFProfTensorTest : public ::testing::Test {
 TEST_F(TFProfTensorTest, Basics) {
   Options opts(3, 0, 0, 0, 0, {".*"}, "name", {"VariableV2"}, {".*"}, {""},
                {".*"}, {""}, false, {"tensor_value"},  // show the tensor value.
-               false);
+               "", {});
   const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
   TFGraphNodeProto expected;
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline.cc b/tensorflow/tools/tfprof/internal/tfprof_timeline.cc
new file mode 100644
index 0000000000..a5640c0e56
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_timeline.cc
@@ -0,0 +1,245 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
+
+#include <utility>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+Json::Value ChromeTraceFormatter::CreateEvent(const string& ph,
+                                              const string& category,
+                                              const string& name, int64 pid,
+                                              int64 tid, int64 ts) {
+  Json::Value event(Json::objectValue);
+  event["ph"] = Json::Value(ph);
+  event["cat"] = Json::Value(category);
+  event["name"] = Json::Value(name);
+  event["pid"] = Json::Value(pid);
+  event["tid"] = Json::Value(tid);
+  event["ts"] = Json::Value(ts);
+  return event;
+}
+
+void ChromeTraceFormatter::EmitPID(const string& name, int64 pid) {
+  Json::Value event(Json::objectValue);
+  event["name"] = Json::Value("process_name");
+  event["ph"] = Json::Value("M");
+  event["pid"] = Json::Value(pid);
+  Json::Value args(Json::objectValue);
+  args["name"] = Json::Value(name);
+  event["args"] = args;
+  metadata_.push_back(event);
+}
+
+void ChromeTraceFormatter::EmitRegion(int64 ts, int64 duration, int64 pid,
+                                      int64 tid, const string& category,
+                                      const string& name, Json::Value args) {
+  Json::Value event = CreateEvent("X", category, name, pid, tid, ts);
+  event["dur"] = Json::Value(duration);
+  event["args"] = std::move(args);
+  metadata_.push_back(event);
+}
+
+void ChromeTraceFormatter::EmitFlowStart(const string& name, int64 ts,
+                                         int64 pid, int64 tid, int64 flow_id) {
+  Json::Value event = CreateEvent("s", "DataFlow", name, pid, tid, ts);
+  event["id"] = flow_id;
+  events_.push_back(event);
+}
+
+void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid,
+                                       int64 tid, int64 flow_id) {
+  Json::Value event = CreateEvent("t", "DataFlow", name, pid, tid, ts);
+  event["id"] = flow_id;
+  events_.push_back(event);
+}
+
+string ChromeTraceFormatter::Format() {
+  Json::Value trace;
+  trace["traceEvents"] = Json::Value(Json::arrayValue);
+  for (const Json::Value& v : metadata_) {
+    trace["traceEvents"].append(v);
+  }
+  for (const Json::Value& v : events_) {
+    trace["traceEvents"].append(v);
+  }
+  return trace.toStyledString();
+}
+
+void Timeline::GenerateGraphTimeline(const GraphNode* gnode) {
+  fprintf(stdout, "adding graph nodes.\n");
+  AddGraphNode(gnode);
+  AllocateLanes();
+  fprintf(stdout, "generating trace file.\n");
+  int64 flow_id = 1;
+  for (const auto& process : alloc_nodes_) {
+    for (const auto& lane : process.second) {
+      for (const auto& node : lane.second) {
+        TimeNode* tnode = node.second;
+
+        Json::Value args(Json::objectValue);
+        args["name"] = Json::Value(tnode->name);
+        args["op"] = Json::Value(tnode->name);
+        chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
+                                     process.first, lane.first, "Op",
+                                     tnode->name, args);
+
+        for (TimeNode* next_tnode : node.second->next_tnodes) {
+          chrome_formatter_.EmitFlowStart(
+              tnode->name + "_flow", tnode->start_micros + tnode->exec_micros,
+              process.first, lane.first, flow_id);
+          chrome_formatter_.EmitFlowEnd(
+              tnode->name + "_flow", next_tnode->start_micros,
+              next_tnode->process->pid, next_tnode->tid, flow_id);
+          flow_id += 1;
+        }
+      }
+    }
+  }
+  OutputTimeline();
+}
+
+void Timeline::GenerateScopeTimeline(const ScopeNode* node) {
+  std::set<int64> visited_depth;
+  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
+  OutputTimeline();
+}
+
+void Timeline::GenerateCodeTimeline(const CodeNode* node) {
+  std::set<int64> visited_depth;
+  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
+  OutputTimeline();
+}
+
+void Timeline::OutputTimeline() {
+  Status s =
+      WriteStringToFile(Env::Default(), outfile_, chrome_formatter_.Format());
+  if (!s.ok()) {
+    fprintf(stderr, "Failed to write timeline file: %s\nError: %s\n",
+            outfile_.c_str(), s.ToString().c_str());
+    return;
+  }
+  fprintf(stdout, "\n******************************************************\n");
+  fprintf(stdout,
+          "Timeline file is written to %s.\n"
+          "Open a Chrome browser, enter URL chrome://tracing and "
+          "load the timeline file.",
+          outfile_.c_str());
+  fprintf(stdout, "\n******************************************************\n");
+  fflush(stdout);
+}
+
+std::vector<TimeNode*> Timeline::AddGraphNode(const GraphNode* gnode) {
+  std::vector<TimeNode*> tnodes;
+  if (!gnode) return tnodes;
+
+  std::vector<TimeNode*> shown_cinputs;
+  for (GraphNode* schild : gnode->show_children) {
+    std::vector<TimeNode*> inputs = AddGraphNode(schild);
+    shown_cinputs.insert(shown_cinputs.end(), inputs.begin(), inputs.end());
+  }
+  if (!gnode->node->step_stats()) {
+    return shown_cinputs;
+  }
+
+  const TFGraphNode* node = gnode->node;
+  for (const auto& kernel_execs : node->op_kernel_execs()) {
+    const string& device = kernel_execs.first;
+    const std::vector<std::pair<int64, int64>>& execs = kernel_execs.second;
+
+    if (process_.find(device) == process_.end()) {
+      int64 pid = AllocatePID();
+      process_[device].reset(new Process(pid));
+      chrome_formatter_.EmitPID(device, pid);
+    }
+    Process* p = process_[device].get();
+
+    for (const auto& exec : execs) {
+      int64 start_micros = exec.first;
+      int64 exec_micros = exec.second;
+      // TODO(xpan): There might be start time duplication here.
+      if (tnodes_[device].find(start_micros) == tnodes_[device].end()) {
+        // TODO(xpan): Give each kernel call a unique_name.
+        tnodes_[device][start_micros].reset(
+            new TimeNode(p, node->name(), start_micros, exec_micros));
+      }
+      TimeNode* tnode_ptr = tnodes_[device][start_micros].get();
+
+      for (int i = 0; i < shown_cinputs.size(); i++) {
+        shown_cinputs[i]->next_tnodes.push_back(tnode_ptr);
+      }
+      tnodes.push_back(tnode_ptr);
+    }
+  }
+  return tnodes;
+}
+
+void Timeline::AllocateLanes() {
+  for (auto& process : tnodes_) {
+    Process* p = process_[process.first].get();
+    for (auto& tnode : process.second) {
+      int64 start_time = tnode.second->start_micros;
+      int64 end_time = tnode.second->exec_micros - 1;
+
+      int64 l = -1;
+      for (int i = 0; i < p->lanes.size(); ++i) {
+        const auto& lane = p->lanes[i];
+        auto cur_it = lane.lower_bound(start_time);
+        if (cur_it == lane.end()) {
+          --cur_it;
+        }
+        l = i;
+        for (; cur_it != lane.begin(); --cur_it) {
+          if (cur_it->second < start_time) {
+            break;
+          }
+          if (cur_it->first <= end_time) {
+            l = -1;
+            break;
+          }
+        }
+        if (l >= 0) {
+          break;
+        }
+      }
+      if (l < 0) {
+        l = p->lanes.size();
+        std::map<int64, int64> nlane;
+        nlane[start_time] = end_time;
+        p->lanes.push_back(nlane);
+      } else {
+        p->lanes[l][start_time] = end_time;
+      }
+      tnode.second->tid = l;
+      alloc_nodes_[p->pid][l][start_time] = tnode.second.get();
+    }
+  }
+}
+
+int64 Timeline::AllocatePID() {
+  int64 cur_pid = next_pid_;
+  next_pid_ += 1;
+  return cur_pid;
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline.h b/tensorflow/tools/tfprof/internal/tfprof_timeline.h
new file mode 100644
index 0000000000..3d26874abd
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_timeline.h
@@ -0,0 +1,147 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TIMELINE_H_
+#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TIMELINE_H_
+
+#include "include/json/json.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+typedef std::map<string, string> Event;
+
+class ChromeTraceFormatter {
+ public:
+  ChromeTraceFormatter() {}
+
+  Json::Value CreateEvent(const string& ph, const string& category,
+                          const string& name, int64 pid, int64 tid, int64 ts);
+
+  void EmitPID(const string& name, int64 pid);
+
+  void EmitRegion(int64 ts, int64 duration, int64 pid, int64 tid,
+                  const string& category, const string& name, Json::Value args);
+
+  void EmitFlowStart(const string& name, int64 ts, int64 pid, int64 tid,
+                     int64 flow_id);
+
+  void EmitFlowEnd(const string& name, int64 ts, int64 pid, int64 tid,
+                   int64 flow_id);
+
+  string Format();
+
+ private:
+  std::vector<Json::Value> events_;
+  std::vector<Json::Value> metadata_;
+};
+
+class Process {
+ public:
+  Process(int64 pid) : pid(pid) {}
+
+  // Each lane is a map from start_time to end_time.
+  std::vector<std::map<int64, int64>> lanes;
+  int64 pid;
+};
+
+class TimeNode {
+ public:
+  TimeNode(Process* process, const string& name, int64 start_micros,
+           int64 exec_micros)
+      : process(process),
+        name(name),
+        start_micros(start_micros),
+        exec_micros(exec_micros),
+        tid(-1) {}
+  virtual ~TimeNode() {}
+
+  Process* process;
+  string name;
+  int64 start_micros;
+  int64 exec_micros;
+  int64 tid;
+  std::vector<TimeNode*> next_tnodes;
+};
+
+class Timeline {
+ public:
+  Timeline(const string& outfile) : outfile_(outfile) {}
+  ~Timeline() {}
+
+  void GenerateGraphTimeline(const GraphNode* gnode);
+
+  void GenerateScopeTimeline(const ScopeNode* node);
+
+  void GenerateCodeTimeline(const CodeNode* node);
+
+ private:
+  void OutputTimeline();
+
+  template <typename Node>
+  void EmitTreeNode(const Node* node, int64 start_time, int64 duration,
+                    int64 depth, std::set<int64>* visited_depth) {
+    if (visited_depth->find(depth) == visited_depth->end()) {
+      chrome_formatter_.EmitPID(strings::StrCat("Scope:", depth), depth);
+      visited_depth->insert(depth);
+    }
+
+    Json::Value args(Json::objectValue);
+    args["name"] = Json::Value(node->name());
+    args["op"] = Json::Value(node->name());
+    chrome_formatter_.EmitRegion(start_time, duration, depth, 0, "Op",
+                                 node->name(), args);
+
+    int64 total_micros = 0;
+    int64 c_start_time = start_time;
+    for (const Node* child : node->show_children) {
+      int64 total_exec_micros = child->proto().total_exec_micros();
+      if (total_exec_micros <= 0) {
+        continue;
+      }
+      EmitTreeNode(child, c_start_time, total_exec_micros, depth + 1,
+                   visited_depth);
+      c_start_time += total_exec_micros;
+      total_micros += total_exec_micros;
+    }
+    CHECK(total_micros <= duration) << node->name() << " parent:" << duration
+                                    << " children:" << total_micros;
+  }
+
+  std::vector<TimeNode*> AddGraphNode(const GraphNode* gnode);
+
+  void AllocateLanes();
+
+  int64 AllocatePID();
+
+  const string outfile_;
+  int64 next_pid_ = 0;
+  int64 allocator_pid_ = -1;
+  ChromeTraceFormatter chrome_formatter_;
+  std::map<string, int64> device_pids_;
+
+  std::map<string, std::unique_ptr<Process>> process_;
+  std::map<int64, std::map<int64, std::map<int64, TimeNode*>>> alloc_nodes_;
+  std::map<string, std::map<int64, std::unique_ptr<TimeNode>>> tnodes_;
+};
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TIMELINE_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc b/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc
new file mode 100644
index 0000000000..2dfe6ab335
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc
@@ -0,0 +1,92 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
+
+#include <utility>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+class TFProfTimelineTest : public ::testing::Test {
+ protected:
+  TFProfTimelineTest() {
+    string graph_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "tools/tfprof/internal/testdata/graph.pbtxt");
+    std::unique_ptr<tensorflow::GraphDef> graph_pb(new tensorflow::GraphDef());
+    TF_CHECK_OK(ReadGraphDef(Env::Default(), graph_path, graph_pb.get()));
+
+    std::unique_ptr<tensorflow::RunMetadata> run_meta_pb(
+        new tensorflow::RunMetadata());
+    string run_meta_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "tools/tfprof/internal/testdata/run_meta");
+    TF_CHECK_OK(
+        ReadBinaryProto(Env::Default(), run_meta_path, run_meta_pb.get()));
+
+    tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
+                                nullptr, nullptr));
+  }
+
+  std::unique_ptr<TFStats> tf_stats_;
+};
+
+// Before adding test, first dump the json file and
+// manually check it's correct
+TEST_F(TFProfTimelineTest, GraphView) {
+  string dump_file = io::JoinPath(testing::TmpDir(), "dump");
+  Options opts(10000, 0, 0, 0, 0, {".*"}, "name",
+               {".*"},  // accout_type_regexes
+               {".*"}, {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
+               "timeline", {{"outfile", dump_file}});
+  tf_stats_->PrintGraph("graph", opts);
+
+  string dump_str;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file, &dump_str));
+  EXPECT_EQ(14171250174278825648ull, Hash64(dump_str));
+}
+
+TEST_F(TFProfTimelineTest, ScopeView) {
+  string dump_file = io::JoinPath(testing::TmpDir(), "dump");
+  Options opts(5, 0, 0, 0, 0, {".*"}, "name", {".*"},  // accout_type_regexes
+               {".*"}, {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
+               "timeline", {{"outfile", dump_file}});
+  tf_stats_->PrintGraph("scope", opts);
+
+  string dump_str;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file, &dump_str));
+  EXPECT_EQ(2355241164346147404ull, Hash64(dump_str));
+}
+
+// TODO(xpan): tfprof_log is too large to include in testdata when adding
+// code traces.
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_utils.cc b/tensorflow/tools/tfprof/internal/tfprof_utils.cc
index eb2be35c6e..8e55e009d3 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_utils.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_utils.cc
@@ -251,19 +251,13 @@ tensorflow::Status ParseCmdLine(const string& line, string* cmd,
       opts->select = requested_set;
       ++i;
     } else if (pieces[i] == tensorflow::tfprof::kOptions[14]) {
-      if ((pieces.size() > i + 1 && pieces[i + 1].find("-") == 0) ||
-          pieces.size() == i + 1) {
-        opts->viz = true;
-      } else if (!StringToBool(pieces[i + 1], &opts->viz)) {
-        return ReturnError(pieces, i);
-      } else {
-        ++i;
-      }
-    } else if (pieces[i] == tensorflow::tfprof::kOptions[15]) {
       if (pieces.size() <= i + 1) {
         return ReturnError(pieces, i);
       }
-      opts->dump_to_file = StripQuote(pieces[i + 1]);
+
+      tensorflow::Status s =
+          ParseOutput(pieces[i + 1], &opts->output_type, &opts->output_options);
+      if (!s.ok()) return s;
       ++i;
     } else {
       return ReturnError(pieces, i);
diff --git a/tensorflow/tools/tfprof/tfprof_main.cc b/tensorflow/tools/tfprof/tfprof_main.cc
index dd18a4ad3c..cfe239da22 100644
--- a/tensorflow/tools/tfprof/tfprof_main.cc
+++ b/tensorflow/tools/tfprof/tfprof_main.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
@@ -82,8 +83,7 @@ int main(int argc, char** argv) {
   tensorflow::string FLAGS_hide_name_regexes;
   bool FLAGS_account_displayed_op_only = false;
   tensorflow::string FLAGS_select = "params";
-  bool FLAGS_viz = false;
-  tensorflow::string FLAGS_dump_to_file = "";
+  tensorflow::string FLAGS_output = "";
   for (int i = 0; i < argc; i++) {
     fprintf(stderr, "%s\n", argv[i]);
   }
@@ -117,7 +117,7 @@ int main(int argc, char** argv) {
                        &FLAGS_account_displayed_op_only,
                        "account displayed op only"),
       tensorflow::Flag("select", &FLAGS_select, "select"),
-      tensorflow::Flag("dump_to_file", &FLAGS_dump_to_file, "dump to file"),
+      tensorflow::Flag("output", &FLAGS_output, "output"),
   };
   tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
@@ -144,6 +144,12 @@ int main(int argc, char** argv) {
   std::vector<tensorflow::string> select =
       Split(FLAGS_select, ',', tensorflow::str_util::SkipEmpty());
 
+  tensorflow::string output_type;
+  std::map<tensorflow::string, tensorflow::string> output_options;
+  tensorflow::Status s = tensorflow::tfprof::ParseOutput(
+      FLAGS_output, &output_type, &output_options);
+  CHECK(s.ok()) << s.ToString();
+
   tensorflow::string cmd = "";
   if (argc == 1 && FLAGS_graph_path.empty()) {
     printf("1) go/tfprof: Tutorial.\n");
@@ -186,10 +192,18 @@ int main(int argc, char** argv) {
 
   std::unique_ptr<tensorflow::tfprof::OpLog> op_log(
       new tensorflow::tfprof::OpLog());
-  if (!ReadBinaryProto(tensorflow::Env::Default(), FLAGS_op_log_path,
-                       op_log.get())
-           .ok()) {
-    op_log.release();
+  if (!FLAGS_op_log_path.empty()) {
+    tensorflow::string op_log_str;
+    s = tensorflow::ReadFileToString(tensorflow::Env::Default(),
+                                     FLAGS_op_log_path, &op_log_str);
+    if (!s.ok()) {
+      fprintf(stderr, "Failed to read op_log_path: %s\n", s.ToString().c_str());
+      return 1;
+    }
+    if (!tensorflow::ParseProtoUnlimited(op_log.get(), op_log_str)) {
+      fprintf(stderr, "Failed to parse op_log_path\n");
+      return 1;
+    }
   }
 
   std::unique_ptr<tensorflow::checkpoint::CheckpointReader> ckpt_reader;
@@ -212,8 +226,8 @@ int main(int argc, char** argv) {
       FLAGS_max_depth, FLAGS_min_bytes, FLAGS_min_micros, FLAGS_min_params,
       FLAGS_min_float_ops, device_regexes, FLAGS_order_by, account_type_regexes,
       start_name_regexes, trim_name_regexes, show_name_regexes,
-      hide_name_regexes, FLAGS_account_displayed_op_only, select, FLAGS_viz,
-      FLAGS_dump_to_file);
+      hide_name_regexes, FLAGS_account_displayed_op_only, select, output_type,
+      output_options);
 
   if (cmd == tensorflow::tfprof::kCmds[2]) {
     tf_stat.PrintCode(opts);
diff --git a/tensorflow/tools/tfprof/tfprof_options.proto b/tensorflow/tools/tfprof/tfprof_options.proto
index 9d269a0995..84a2e14005 100644
--- a/tensorflow/tools/tfprof/tfprof_options.proto
+++ b/tensorflow/tools/tfprof/tfprof_options.proto
@@ -19,6 +19,6 @@ message OptionsProto {
   repeated string hide_name_regexes = 12;
   optional bool account_displayed_op_only = 13;
   repeated string select = 14;
-  optional bool viz = 15;
+  optional string output = 15;
   optional string dump_to_file = 16;
 }
diff --git a/tensorflow/tools/tfprof/tfprof_output.proto b/tensorflow/tools/tfprof/tfprof_output.proto
index 78dd056662..93e6c1233c 100644
--- a/tensorflow/tools/tfprof/tfprof_output.proto
+++ b/tensorflow/tools/tfprof/tfprof_output.proto
@@ -31,7 +31,8 @@ message TFGraphNodeProto {
   // Number of inputs to the op.
   optional int64 inputs = 5;
   // Device the op is assigned to.
-  optional string device = 10;
+  // Since an op can fire multiple kernel calls, there can be multiple devices.
+  repeated string devices = 10;
 
   // The following are the aggregated stats from all accounted descendants and
   // the op itself. The actual descendants depend on the data structure used