aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2018-08-02 03:42:31 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-08-02 03:46:25 -0700
commit03061e4cbfc8128e632536eee0af1ebad922b5ba (patch)
treee2724b70c176d724adbb860f6de51b73c1cf30d5
parent1c8a34641982294a72053304f0ece2efcbc866dd (diff)
Use device and colocation stack metadata to improve an error message in ops.py. This change does not depend on the new config.experimental.client_handles_error_formatting flag. I also attempted to modify relevant interpolated error strings so an uninterpolated error message still read correclty if you removed the interpolation tokens.
PiperOrigin-RevId: 207075862
-rw-r--r--tensorflow/core/common_runtime/placer.cc4
-rw-r--r--tensorflow/core/common_runtime/placer_test.cc8
-rw-r--r--tensorflow/python/BUILD1
-rw-r--r--tensorflow/python/framework/error_interpolation.py96
-rw-r--r--tensorflow/python/framework/error_interpolation_test.py15
-rw-r--r--tensorflow/python/framework/ops.py48
-rw-r--r--tensorflow/python/framework/ops_test.py22
7 files changed, 132 insertions, 62 deletions
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index 613470365d..d581f45a90 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -937,8 +937,8 @@ bool Placer::ClientHandlesErrorFormatting() const {
string Placer::RichNodeName(const Node* node) const {
string quoted_name = strings::StrCat("'", node->name(), "'");
if (ClientHandlesErrorFormatting()) {
- string file_and_line = error_format_tag(*node, "${file}:${line}");
- return strings::StrCat(quoted_name, " (defined at ", file_and_line, ")");
+ string file_and_line = error_format_tag(*node, "${defined_at}");
+ return strings::StrCat(quoted_name, file_and_line);
} else {
return quoted_name;
}
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index cede899842..87f2f2ceb9 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -1158,10 +1158,10 @@ TEST_F(PlacerTest, TestNonexistentGpuNoAllowSoftPlacementFormatTag) {
true);
Status s = Place(&g, &options);
EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
- EXPECT_TRUE(
- str_util::StrContains(s.error_message(),
- "Cannot assign a device for operation 'in'"
- " (defined at ^^node:in:${file}:${line}^^)"));
+ LOG(WARNING) << s.error_message();
+ EXPECT_TRUE(str_util::StrContains(s.error_message(),
+ "Cannot assign a device for operation 'in'"
+ "^^node:in:${defined_at}^^"));
}
// Test that the "Cannot assign a device" error message does not contain a
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 2ccaae4dcf..2b8110a999 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -836,6 +836,7 @@ py_library(
":control_flow_util",
":device",
":dtypes",
+ ":error_interpolation",
":op_def_registry",
":platform",
":registry",
diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py
index 7719d03019..6e844e14b9 100644
--- a/tensorflow/python/framework/error_interpolation.py
+++ b/tensorflow/python/framework/error_interpolation.py
@@ -87,17 +87,18 @@ def _parse_message(message):
return seps, tags
-def _compute_device_summary_from_list(device_assignment_list, prefix=""):
+def _compute_device_summary_from_list(name, device_assignment_list, prefix=""):
"""Return a summary of an op's device function stack.
Args:
+ name: The name of the op.
device_assignment_list: The op._device_assignments list.
prefix: An optional string prefix used before each line of the multi-
line string returned by this function.
Returns:
A multi-line string similar to:
- Device assignments active during op creation:
+ Device assignments active during op 'foo' creation:
with tf.device(/cpu:0): <test_1.py:27>
with tf.device(some_func<foo.py, 123>): <test_2.py:38>
The first line will have no padding to its left by default. Subsequent
@@ -105,11 +106,13 @@ def _compute_device_summary_from_list(device_assignment_list, prefix=""):
to increase indentation.
"""
if not device_assignment_list:
- message = "No device assignments were active during op creation."
+ message = "No device assignments were active during op '%s' creation."
+ message %= name
return prefix + message
str_list = []
- str_list.append("%sDevice assignments active during op creation:" % prefix)
+ str_list.append("%sDevice assignments active during op '%s' creation:"
+ % (prefix, name))
for traceable_obj in device_assignment_list:
location_summary = "<{file}:{line}>".format(file=traceable_obj.filename,
@@ -127,17 +130,17 @@ def _compute_device_summary_from_list(device_assignment_list, prefix=""):
def _compute_device_assignment_summary_from_op(op, prefix=""):
- if not op:
- return ""
# pylint: disable=protected-access
- return _compute_device_summary_from_list(op._device_assignments, prefix)
+ return _compute_device_summary_from_list(op.name, op._device_assignments,
+ prefix)
# pylint: enable=protected-access
-def _compute_colocation_summary_from_dict(colocation_dict, prefix=""):
+def _compute_colocation_summary_from_dict(name, colocation_dict, prefix=""):
"""Return a summary of an op's colocation stack.
Args:
+ name: The op name.
colocation_dict: The op._colocation_dict.
prefix: An optional string prefix used before each line of the multi-
line string returned by this function.
@@ -152,20 +155,21 @@ def _compute_colocation_summary_from_dict(colocation_dict, prefix=""):
to increase indentation.
"""
if not colocation_dict:
- message = "No node-device colocations were active during op creation."
+ message = "No node-device colocations were active during op '%s' creation."
+ message %= name
return prefix + message
str_list = []
- str_list.append("%sNode-device colocations active during op creation:"
- % prefix)
+ str_list.append("%sNode-device colocations active during op '%s' creation:"
+ % (prefix, name))
- for name, location in colocation_dict.items():
+ for coloc_name, location in colocation_dict.items():
location_summary = "<{file}:{line}>".format(file=location.filename,
line=location.lineno)
subs = {
"prefix": prefix,
"indent": " ",
- "name": name,
+ "name": coloc_name,
"loc": location_summary,
}
str_list.append(
@@ -176,11 +180,8 @@ def _compute_colocation_summary_from_dict(colocation_dict, prefix=""):
def _compute_colocation_summary_from_op(op, prefix=""):
"""Fetch colocation file, line, and nesting and return a summary string."""
- if not op:
- return ""
- # pylint: disable=protected-access
- return _compute_colocation_summary_from_dict(op._colocation_dict, prefix)
- # pylint: enable=protected-access
+ return _compute_colocation_summary_from_dict(
+ op.name, op._colocation_dict, prefix) # pylint: disable=protected-access
def _find_index_of_defining_frame_for_op(op):
@@ -216,16 +217,14 @@ def _find_index_of_defining_frame_for_op(op):
def _get_defining_frame_from_op(op):
"""Find and return stack frame where op was defined."""
- frame = None
- if op:
- # pylint: disable=protected-access
- frame_index = _find_index_of_defining_frame_for_op(op)
- frame = op._traceback[frame_index]
- # pylint: enable=protected-access
+ frame_index = _find_index_of_defining_frame_for_op(op)
+ # pylint: disable=protected-access
+ frame = op._traceback[frame_index]
+ # pylint: enable=protected-access
return frame
-def _compute_field_dict(op):
+def compute_field_dict(op):
"""Return a dictionary mapping interpolation tokens to values.
Args:
@@ -237,32 +236,40 @@ def _compute_field_dict(op):
{
"file": "tool_utils.py",
"line": "124",
+ "defined_at": " (defined at tool_utils.py:124)",
"colocations":
'''Node-device colocations active during op creation:
with tf.colocate_with(test_node_1): <test_1.py:27>
with tf.colocate_with(test_node_2): <test_2.py:38>'''
+ "devices":
+ '''Device assignments active during op 'foo' creation:
+ with tf.device(/cpu:0): <test_1.py:27>
+ with tf.device(some_func<foo.py, 123>): <test_2.py:38>'''
+ "devs_and_colocs": A concatenation of colocations and devices, e.g.
+ '''Node-device colocations active during op creation:
+ with tf.colocate_with(test_node_1): <test_1.py:27>
+ with tf.colocate_with(test_node_2): <test_2.py:38>'''
+ Device assignments active during op 'foo' creation:
+ with tf.device(/cpu:0): <test_1.py:27>
+ with tf.device(some_func<foo.py, 123>): <test_2.py:38>'''
}
- If op is None or lacks a _traceback field, the returned values will be
- "<NA>".
"""
- default_value = "<NA>"
- field_dict = {
- "file": default_value,
- "line": default_value,
- "colocations": default_value,
- "devices": default_value,
- }
frame = _get_defining_frame_from_op(op)
- if frame:
- field_dict["file"] = frame[tf_stack.TB_FILENAME]
- field_dict["line"] = frame[tf_stack.TB_LINENO]
+ filename = frame[tf_stack.TB_FILENAME]
+ lineno = frame[tf_stack.TB_LINENO]
+ defined_at = " (defined at %s:%d)" % (filename, lineno)
colocation_summary = _compute_colocation_summary_from_op(op)
- if colocation_summary:
- field_dict["colocations"] = colocation_summary
device_summary = _compute_device_assignment_summary_from_op(op)
- if device_summary:
- field_dict["devices"] = device_summary
+ combined_summary = "\n".join([colocation_summary, device_summary])
+ field_dict = {
+ "file": filename,
+ "line": lineno,
+ "defined_at": defined_at,
+ "colocations": colocation_summary,
+ "devices": device_summary,
+ "devs_and_colocs": combined_summary,
+ }
return field_dict
@@ -291,7 +298,12 @@ def interpolate(error_message, graph):
except KeyError:
op = None
- node_name_to_substitution_dict[name] = _compute_field_dict(op)
+ if op is not None:
+ field_dict = compute_field_dict(op)
+ else:
+ msg = "<NA>"
+ field_dict = collections.defaultdict(lambda s=msg: s)
+ node_name_to_substitution_dict[name] = field_dict
subs = [
string.Template(tag.format).safe_substitute(
diff --git a/tensorflow/python/framework/error_interpolation_test.py b/tensorflow/python/framework/error_interpolation_test.py
index fbf182879b..0427156b2b 100644
--- a/tensorflow/python/framework/error_interpolation_test.py
+++ b/tensorflow/python/framework/error_interpolation_test.py
@@ -71,8 +71,9 @@ class ComputeDeviceSummaryFromOpTest(test.TestCase):
lineno=42))
summary = error_interpolation._compute_device_summary_from_list(
- assignments, prefix=" ")
+ "nodename", assignments, prefix=" ")
+ self.assertIn("nodename", summary)
self.assertIn("tf.device(/cpu:0)", summary)
self.assertIn("<hope.py:24>", summary)
self.assertIn("tf.device(/gpu:2)", summary)
@@ -81,7 +82,8 @@ class ComputeDeviceSummaryFromOpTest(test.TestCase):
def testCorrectFormatWhenNoColocationsWereActive(self):
device_assignment_list = []
summary = error_interpolation._compute_device_summary_from_list(
- device_assignment_list, prefix=" ")
+ "nodename", device_assignment_list, prefix=" ")
+ self.assertIn("nodename", summary)
self.assertIn("No device assignments", summary)
@@ -99,7 +101,8 @@ class ComputeColocationSummaryFromOpTest(test.TestCase):
"test_node_2": t_obj_2,
}
summary = error_interpolation._compute_colocation_summary_from_dict(
- colocation_dict, prefix=" ")
+ "node_name", colocation_dict, prefix=" ")
+ self.assertIn("node_name", summary)
self.assertIn("colocate_with(test_node_1)", summary)
self.assertIn("<test_1.py:27>", summary)
self.assertIn("colocate_with(test_node_2)", summary)
@@ -108,7 +111,8 @@ class ComputeColocationSummaryFromOpTest(test.TestCase):
def testCorrectFormatWhenNoColocationsWereActive(self):
colocation_dict = {}
summary = error_interpolation._compute_colocation_summary_from_dict(
- colocation_dict, prefix=" ")
+ "node_name", colocation_dict, prefix=" ")
+ self.assertIn("node_name", summary)
self.assertIn("No node-device colocations", summary)
@@ -176,7 +180,7 @@ class InterpolateFilenamesAndLineNumbersTest(test.TestCase):
one_tag_string = "^^node:MinusOne:${file}^^"
interpolated_string = error_interpolation.interpolate(one_tag_string,
self.graph)
- self.assertEqual(interpolated_string, "<NA>")
+ self.assertEqual("<NA>", interpolated_string)
def testTwoTagsNoSeps(self):
two_tags_no_seps = "^^node:One:${file}^^^^node:Three:${line}^^"
@@ -287,7 +291,6 @@ class InterpolateColocationSummaryTest(test.TestCase):
message = "^^node:One:${colocations}^^"
result = error_interpolation.interpolate(message, self.graph)
self.assertIn("No node-device colocations", result)
- self.assertNotIn("One", result)
self.assertNotIn("Two", result)
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index c25e29b0f4..cf8689895f 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -44,6 +44,7 @@ from tensorflow.python.framework import c_api_util
from tensorflow.python.framework import cpp_shape_inference_pb2
from tensorflow.python.framework import device as pydev
from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import error_interpolation
from tensorflow.python.framework import errors
from tensorflow.python.framework import op_def_registry
from tensorflow.python.framework import registry
@@ -3292,6 +3293,36 @@ class Graph(object):
self._create_op_helper(ret, compute_device=compute_device)
return ret
+ def _make_colocation_conflict_message(self, op, colocation_op):
+ """Return detailed error message about device conflict due to colocation."""
+ # Example error message:
+ # Tried to colocate op 'a' (defined at file1.py:149) having device
+ # '/device:GPU:0' with op 'b' (defined at file2:96) which had an
+ # incompatible device '/device:CPU:0'.
+ #
+ # No node-device colocations were active during op 'a' creation.
+ # Device assignments active during op 'a' creation:
+ # with tf.device(/device:GPU:0): file1.py:148>
+ #
+ # Node-device colocations active during op 'b' creation:
+ # with tf.colocate_with(a): file2.py:93>
+ # Device assignments active during op 'b' creation:
+ # with tf.device(/cpu:0): file2.py:94
+ op_info = error_interpolation.compute_field_dict(op)
+ coloc_op_info = error_interpolation.compute_field_dict(colocation_op)
+ msg = ("Tried to colocate op '{op_name}'{op_loc} having device '{op_dev}' "
+ "with op '{coloc_op_name}'{coloc_op_loc} which had an incompatible "
+ "device '{coloc_op_dev}'.\n\n{op_summary}\n\n{coloc_op_summary}"
+ .format(op_name=op.name,
+ op_loc=op_info["defined_at"],
+ op_dev=op.device,
+ op_summary=op_info["devs_and_colocs"],
+ coloc_op_name=colocation_op.name,
+ coloc_op_loc=coloc_op_info["defined_at"],
+ coloc_op_dev=colocation_op.device,
+ coloc_op_summary=coloc_op_info["devs_and_colocs"]))
+ return msg
+
def _create_op_helper(self, op, compute_device=True):
"""Common logic for creating an op in this graph."""
# Apply any additional attributes requested. Do not overwrite any existing
@@ -3332,20 +3363,22 @@ class Graph(object):
if compute_device:
self._apply_device_functions(op)
+ # Snapshot the colocation stack metadata before we might generate error
+ # messages using it. Note that this snapshot depends on the actual stack
+ # and is independent of the op's _class attribute.
+ # pylint: disable=protected-access
+ op._colocation_code_locations = self._snapshot_colocation_stack_metadata()
+ # pylint: enable=protected-access
+
if self._colocation_stack:
all_colocation_groups = []
for colocation_op in self._colocation_stack.peek_objs():
all_colocation_groups.extend(colocation_op.colocation_groups())
if colocation_op.device:
- # Make this device match the device of the colocated op, to provide
- # consistency between the device and the colocation property.
if (op.device and pydev.canonical_name(op.device) !=
pydev.canonical_name(colocation_op.device)):
- logging.warning("Tried to colocate %s with an op %s that had "
- "a different device: %s vs %s. Postponing "
- "error-checking until all devices are assigned.",
- op.name, colocation_op.name, op.device,
- colocation_op.device)
+ msg = self._make_colocation_conflict_message(op, colocation_op)
+ logging.warning(msg)
else:
op._set_device(colocation_op.device) # pylint: disable=protected-access
@@ -3353,7 +3386,6 @@ class Graph(object):
# pylint: disable=protected-access
op._set_attr("_class", attr_value_pb2.AttrValue(
list=attr_value_pb2.AttrValue.ListValue(s=all_colocation_groups)))
- op._colocation_code_locations = self._snapshot_colocation_stack_metadata()
# pylint: enable=protected-access
# Sets "container" attribute if
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 48328a7f58..318387c61b 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -2728,6 +2728,28 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
self.assertEqual("/device:CPU:0", b.device)
+ def testMakeColocationConflictMessage(self):
+ """Test that provides an example of a complicated error message."""
+ # We could test the message with any ops, but this test will be more
+ # instructive with a real colocation conflict.
+ with ops.device("/device:GPU:0"):
+ a = constant_op.constant([2.0], name="a")
+ with ops.colocate_with(a.op):
+ with ops.device("/cpu:0"):
+ b = constant_op.constant([3.0], name="b")
+ # The definition-location of the nodes will be wrong because of running
+ # from within a TF unittest. The rest of the info should be correct.
+ message = ops.get_default_graph()._make_colocation_conflict_message(a.op,
+ b.op)
+ self.assertRegexpMatches(message,
+ r"Tried to colocate op 'a' \(defined at.*\)")
+ self.assertRegexpMatches(message, "No node-device.*'a'")
+ self.assertRegexpMatches(message, "Device assignments active.*'a'")
+ self.assertRegexpMatches(message, "GPU:0")
+ self.assertRegexpMatches(message, "Node-device colocations active.*'b'")
+ self.assertRegexpMatches(message, "Device assignments active.*'b'")
+ self.assertRegexpMatches(message, "cpu:0")
+
class DeprecatedTest(test_util.TensorFlowTestCase):