diff options
author | A. Unique TensorFlower <gardener@tensorflow.org> | 2018-08-02 03:42:31 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-08-02 03:46:25 -0700 |
commit | 03061e4cbfc8128e632536eee0af1ebad922b5ba (patch) | |
tree | e2724b70c176d724adbb860f6de51b73c1cf30d5 | |
parent | 1c8a34641982294a72053304f0ece2efcbc866dd (diff) |
Use device and colocation stack metadata to improve an error message in ops.py. This change does not depend on the new config.experimental.client_handles_error_formatting flag. I also attempted to modify relevant interpolated error strings so an uninterpolated error message still read correclty if you removed the interpolation tokens.
PiperOrigin-RevId: 207075862
-rw-r--r-- | tensorflow/core/common_runtime/placer.cc | 4 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/placer_test.cc | 8 | ||||
-rw-r--r-- | tensorflow/python/BUILD | 1 | ||||
-rw-r--r-- | tensorflow/python/framework/error_interpolation.py | 96 | ||||
-rw-r--r-- | tensorflow/python/framework/error_interpolation_test.py | 15 | ||||
-rw-r--r-- | tensorflow/python/framework/ops.py | 48 | ||||
-rw-r--r-- | tensorflow/python/framework/ops_test.py | 22 |
7 files changed, 132 insertions, 62 deletions
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc index 613470365d..d581f45a90 100644 --- a/tensorflow/core/common_runtime/placer.cc +++ b/tensorflow/core/common_runtime/placer.cc @@ -937,8 +937,8 @@ bool Placer::ClientHandlesErrorFormatting() const { string Placer::RichNodeName(const Node* node) const { string quoted_name = strings::StrCat("'", node->name(), "'"); if (ClientHandlesErrorFormatting()) { - string file_and_line = error_format_tag(*node, "${file}:${line}"); - return strings::StrCat(quoted_name, " (defined at ", file_and_line, ")"); + string file_and_line = error_format_tag(*node, "${defined_at}"); + return strings::StrCat(quoted_name, file_and_line); } else { return quoted_name; } diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc index cede899842..87f2f2ceb9 100644 --- a/tensorflow/core/common_runtime/placer_test.cc +++ b/tensorflow/core/common_runtime/placer_test.cc @@ -1158,10 +1158,10 @@ TEST_F(PlacerTest, TestNonexistentGpuNoAllowSoftPlacementFormatTag) { true); Status s = Place(&g, &options); EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); - EXPECT_TRUE( - str_util::StrContains(s.error_message(), - "Cannot assign a device for operation 'in'" - " (defined at ^^node:in:${file}:${line}^^)")); + LOG(WARNING) << s.error_message(); + EXPECT_TRUE(str_util::StrContains(s.error_message(), + "Cannot assign a device for operation 'in'" + "^^node:in:${defined_at}^^")); } // Test that the "Cannot assign a device" error message does not contain a diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 2ccaae4dcf..2b8110a999 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -836,6 +836,7 @@ py_library( ":control_flow_util", ":device", ":dtypes", + ":error_interpolation", ":op_def_registry", ":platform", ":registry", diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py index 7719d03019..6e844e14b9 100644 --- a/tensorflow/python/framework/error_interpolation.py +++ b/tensorflow/python/framework/error_interpolation.py @@ -87,17 +87,18 @@ def _parse_message(message): return seps, tags -def _compute_device_summary_from_list(device_assignment_list, prefix=""): +def _compute_device_summary_from_list(name, device_assignment_list, prefix=""): """Return a summary of an op's device function stack. Args: + name: The name of the op. device_assignment_list: The op._device_assignments list. prefix: An optional string prefix used before each line of the multi- line string returned by this function. Returns: A multi-line string similar to: - Device assignments active during op creation: + Device assignments active during op 'foo' creation: with tf.device(/cpu:0): <test_1.py:27> with tf.device(some_func<foo.py, 123>): <test_2.py:38> The first line will have no padding to its left by default. Subsequent @@ -105,11 +106,13 @@ def _compute_device_summary_from_list(device_assignment_list, prefix=""): to increase indentation. """ if not device_assignment_list: - message = "No device assignments were active during op creation." + message = "No device assignments were active during op '%s' creation." + message %= name return prefix + message str_list = [] - str_list.append("%sDevice assignments active during op creation:" % prefix) + str_list.append("%sDevice assignments active during op '%s' creation:" + % (prefix, name)) for traceable_obj in device_assignment_list: location_summary = "<{file}:{line}>".format(file=traceable_obj.filename, @@ -127,17 +130,17 @@ def _compute_device_summary_from_list(device_assignment_list, prefix=""): def _compute_device_assignment_summary_from_op(op, prefix=""): - if not op: - return "" # pylint: disable=protected-access - return _compute_device_summary_from_list(op._device_assignments, prefix) + return _compute_device_summary_from_list(op.name, op._device_assignments, + prefix) # pylint: enable=protected-access -def _compute_colocation_summary_from_dict(colocation_dict, prefix=""): +def _compute_colocation_summary_from_dict(name, colocation_dict, prefix=""): """Return a summary of an op's colocation stack. Args: + name: The op name. colocation_dict: The op._colocation_dict. prefix: An optional string prefix used before each line of the multi- line string returned by this function. @@ -152,20 +155,21 @@ def _compute_colocation_summary_from_dict(colocation_dict, prefix=""): to increase indentation. """ if not colocation_dict: - message = "No node-device colocations were active during op creation." + message = "No node-device colocations were active during op '%s' creation." + message %= name return prefix + message str_list = [] - str_list.append("%sNode-device colocations active during op creation:" - % prefix) + str_list.append("%sNode-device colocations active during op '%s' creation:" + % (prefix, name)) - for name, location in colocation_dict.items(): + for coloc_name, location in colocation_dict.items(): location_summary = "<{file}:{line}>".format(file=location.filename, line=location.lineno) subs = { "prefix": prefix, "indent": " ", - "name": name, + "name": coloc_name, "loc": location_summary, } str_list.append( @@ -176,11 +180,8 @@ def _compute_colocation_summary_from_dict(colocation_dict, prefix=""): def _compute_colocation_summary_from_op(op, prefix=""): """Fetch colocation file, line, and nesting and return a summary string.""" - if not op: - return "" - # pylint: disable=protected-access - return _compute_colocation_summary_from_dict(op._colocation_dict, prefix) - # pylint: enable=protected-access + return _compute_colocation_summary_from_dict( + op.name, op._colocation_dict, prefix) # pylint: disable=protected-access def _find_index_of_defining_frame_for_op(op): @@ -216,16 +217,14 @@ def _find_index_of_defining_frame_for_op(op): def _get_defining_frame_from_op(op): """Find and return stack frame where op was defined.""" - frame = None - if op: - # pylint: disable=protected-access - frame_index = _find_index_of_defining_frame_for_op(op) - frame = op._traceback[frame_index] - # pylint: enable=protected-access + frame_index = _find_index_of_defining_frame_for_op(op) + # pylint: disable=protected-access + frame = op._traceback[frame_index] + # pylint: enable=protected-access return frame -def _compute_field_dict(op): +def compute_field_dict(op): """Return a dictionary mapping interpolation tokens to values. Args: @@ -237,32 +236,40 @@ def _compute_field_dict(op): { "file": "tool_utils.py", "line": "124", + "defined_at": " (defined at tool_utils.py:124)", "colocations": '''Node-device colocations active during op creation: with tf.colocate_with(test_node_1): <test_1.py:27> with tf.colocate_with(test_node_2): <test_2.py:38>''' + "devices": + '''Device assignments active during op 'foo' creation: + with tf.device(/cpu:0): <test_1.py:27> + with tf.device(some_func<foo.py, 123>): <test_2.py:38>''' + "devs_and_colocs": A concatenation of colocations and devices, e.g. + '''Node-device colocations active during op creation: + with tf.colocate_with(test_node_1): <test_1.py:27> + with tf.colocate_with(test_node_2): <test_2.py:38>''' + Device assignments active during op 'foo' creation: + with tf.device(/cpu:0): <test_1.py:27> + with tf.device(some_func<foo.py, 123>): <test_2.py:38>''' } - If op is None or lacks a _traceback field, the returned values will be - "<NA>". """ - default_value = "<NA>" - field_dict = { - "file": default_value, - "line": default_value, - "colocations": default_value, - "devices": default_value, - } frame = _get_defining_frame_from_op(op) - if frame: - field_dict["file"] = frame[tf_stack.TB_FILENAME] - field_dict["line"] = frame[tf_stack.TB_LINENO] + filename = frame[tf_stack.TB_FILENAME] + lineno = frame[tf_stack.TB_LINENO] + defined_at = " (defined at %s:%d)" % (filename, lineno) colocation_summary = _compute_colocation_summary_from_op(op) - if colocation_summary: - field_dict["colocations"] = colocation_summary device_summary = _compute_device_assignment_summary_from_op(op) - if device_summary: - field_dict["devices"] = device_summary + combined_summary = "\n".join([colocation_summary, device_summary]) + field_dict = { + "file": filename, + "line": lineno, + "defined_at": defined_at, + "colocations": colocation_summary, + "devices": device_summary, + "devs_and_colocs": combined_summary, + } return field_dict @@ -291,7 +298,12 @@ def interpolate(error_message, graph): except KeyError: op = None - node_name_to_substitution_dict[name] = _compute_field_dict(op) + if op is not None: + field_dict = compute_field_dict(op) + else: + msg = "<NA>" + field_dict = collections.defaultdict(lambda s=msg: s) + node_name_to_substitution_dict[name] = field_dict subs = [ string.Template(tag.format).safe_substitute( diff --git a/tensorflow/python/framework/error_interpolation_test.py b/tensorflow/python/framework/error_interpolation_test.py index fbf182879b..0427156b2b 100644 --- a/tensorflow/python/framework/error_interpolation_test.py +++ b/tensorflow/python/framework/error_interpolation_test.py @@ -71,8 +71,9 @@ class ComputeDeviceSummaryFromOpTest(test.TestCase): lineno=42)) summary = error_interpolation._compute_device_summary_from_list( - assignments, prefix=" ") + "nodename", assignments, prefix=" ") + self.assertIn("nodename", summary) self.assertIn("tf.device(/cpu:0)", summary) self.assertIn("<hope.py:24>", summary) self.assertIn("tf.device(/gpu:2)", summary) @@ -81,7 +82,8 @@ class ComputeDeviceSummaryFromOpTest(test.TestCase): def testCorrectFormatWhenNoColocationsWereActive(self): device_assignment_list = [] summary = error_interpolation._compute_device_summary_from_list( - device_assignment_list, prefix=" ") + "nodename", device_assignment_list, prefix=" ") + self.assertIn("nodename", summary) self.assertIn("No device assignments", summary) @@ -99,7 +101,8 @@ class ComputeColocationSummaryFromOpTest(test.TestCase): "test_node_2": t_obj_2, } summary = error_interpolation._compute_colocation_summary_from_dict( - colocation_dict, prefix=" ") + "node_name", colocation_dict, prefix=" ") + self.assertIn("node_name", summary) self.assertIn("colocate_with(test_node_1)", summary) self.assertIn("<test_1.py:27>", summary) self.assertIn("colocate_with(test_node_2)", summary) @@ -108,7 +111,8 @@ class ComputeColocationSummaryFromOpTest(test.TestCase): def testCorrectFormatWhenNoColocationsWereActive(self): colocation_dict = {} summary = error_interpolation._compute_colocation_summary_from_dict( - colocation_dict, prefix=" ") + "node_name", colocation_dict, prefix=" ") + self.assertIn("node_name", summary) self.assertIn("No node-device colocations", summary) @@ -176,7 +180,7 @@ class InterpolateFilenamesAndLineNumbersTest(test.TestCase): one_tag_string = "^^node:MinusOne:${file}^^" interpolated_string = error_interpolation.interpolate(one_tag_string, self.graph) - self.assertEqual(interpolated_string, "<NA>") + self.assertEqual("<NA>", interpolated_string) def testTwoTagsNoSeps(self): two_tags_no_seps = "^^node:One:${file}^^^^node:Three:${line}^^" @@ -287,7 +291,6 @@ class InterpolateColocationSummaryTest(test.TestCase): message = "^^node:One:${colocations}^^" result = error_interpolation.interpolate(message, self.graph) self.assertIn("No node-device colocations", result) - self.assertNotIn("One", result) self.assertNotIn("Two", result) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index c25e29b0f4..cf8689895f 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -44,6 +44,7 @@ from tensorflow.python.framework import c_api_util from tensorflow.python.framework import cpp_shape_inference_pb2 from tensorflow.python.framework import device as pydev from tensorflow.python.framework import dtypes +from tensorflow.python.framework import error_interpolation from tensorflow.python.framework import errors from tensorflow.python.framework import op_def_registry from tensorflow.python.framework import registry @@ -3292,6 +3293,36 @@ class Graph(object): self._create_op_helper(ret, compute_device=compute_device) return ret + def _make_colocation_conflict_message(self, op, colocation_op): + """Return detailed error message about device conflict due to colocation.""" + # Example error message: + # Tried to colocate op 'a' (defined at file1.py:149) having device + # '/device:GPU:0' with op 'b' (defined at file2:96) which had an + # incompatible device '/device:CPU:0'. + # + # No node-device colocations were active during op 'a' creation. + # Device assignments active during op 'a' creation: + # with tf.device(/device:GPU:0): file1.py:148> + # + # Node-device colocations active during op 'b' creation: + # with tf.colocate_with(a): file2.py:93> + # Device assignments active during op 'b' creation: + # with tf.device(/cpu:0): file2.py:94 + op_info = error_interpolation.compute_field_dict(op) + coloc_op_info = error_interpolation.compute_field_dict(colocation_op) + msg = ("Tried to colocate op '{op_name}'{op_loc} having device '{op_dev}' " + "with op '{coloc_op_name}'{coloc_op_loc} which had an incompatible " + "device '{coloc_op_dev}'.\n\n{op_summary}\n\n{coloc_op_summary}" + .format(op_name=op.name, + op_loc=op_info["defined_at"], + op_dev=op.device, + op_summary=op_info["devs_and_colocs"], + coloc_op_name=colocation_op.name, + coloc_op_loc=coloc_op_info["defined_at"], + coloc_op_dev=colocation_op.device, + coloc_op_summary=coloc_op_info["devs_and_colocs"])) + return msg + def _create_op_helper(self, op, compute_device=True): """Common logic for creating an op in this graph.""" # Apply any additional attributes requested. Do not overwrite any existing @@ -3332,20 +3363,22 @@ class Graph(object): if compute_device: self._apply_device_functions(op) + # Snapshot the colocation stack metadata before we might generate error + # messages using it. Note that this snapshot depends on the actual stack + # and is independent of the op's _class attribute. + # pylint: disable=protected-access + op._colocation_code_locations = self._snapshot_colocation_stack_metadata() + # pylint: enable=protected-access + if self._colocation_stack: all_colocation_groups = [] for colocation_op in self._colocation_stack.peek_objs(): all_colocation_groups.extend(colocation_op.colocation_groups()) if colocation_op.device: - # Make this device match the device of the colocated op, to provide - # consistency between the device and the colocation property. if (op.device and pydev.canonical_name(op.device) != pydev.canonical_name(colocation_op.device)): - logging.warning("Tried to colocate %s with an op %s that had " - "a different device: %s vs %s. Postponing " - "error-checking until all devices are assigned.", - op.name, colocation_op.name, op.device, - colocation_op.device) + msg = self._make_colocation_conflict_message(op, colocation_op) + logging.warning(msg) else: op._set_device(colocation_op.device) # pylint: disable=protected-access @@ -3353,7 +3386,6 @@ class Graph(object): # pylint: disable=protected-access op._set_attr("_class", attr_value_pb2.AttrValue( list=attr_value_pb2.AttrValue.ListValue(s=all_colocation_groups))) - op._colocation_code_locations = self._snapshot_colocation_stack_metadata() # pylint: enable=protected-access # Sets "container" attribute if diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py index 48328a7f58..318387c61b 100644 --- a/tensorflow/python/framework/ops_test.py +++ b/tensorflow/python/framework/ops_test.py @@ -2728,6 +2728,28 @@ class ColocationGroupTest(test_util.TensorFlowTestCase): self.assertEqual("/device:CPU:0", b.device) + def testMakeColocationConflictMessage(self): + """Test that provides an example of a complicated error message.""" + # We could test the message with any ops, but this test will be more + # instructive with a real colocation conflict. + with ops.device("/device:GPU:0"): + a = constant_op.constant([2.0], name="a") + with ops.colocate_with(a.op): + with ops.device("/cpu:0"): + b = constant_op.constant([3.0], name="b") + # The definition-location of the nodes will be wrong because of running + # from within a TF unittest. The rest of the info should be correct. + message = ops.get_default_graph()._make_colocation_conflict_message(a.op, + b.op) + self.assertRegexpMatches(message, + r"Tried to colocate op 'a' \(defined at.*\)") + self.assertRegexpMatches(message, "No node-device.*'a'") + self.assertRegexpMatches(message, "Device assignments active.*'a'") + self.assertRegexpMatches(message, "GPU:0") + self.assertRegexpMatches(message, "Node-device colocations active.*'b'") + self.assertRegexpMatches(message, "Device assignments active.*'b'") + self.assertRegexpMatches(message, "cpu:0") + class DeprecatedTest(test_util.TensorFlowTestCase): |