tfdbg core: add configurable attributes to debug ops, DebugNumericSummary

Added three attributes to the debug op "DebugNumericSummary" used in tfdbg-based TensorBoard health pills: 1) lower_bound (type: float); 2) upper_bound (type: float); 3) mute_if_healthy (type: bool). lower_bound and upper_bound make it possible to customize the thresholds beyond which tensor elements are counted as -inf or inf. mute_if_healthy makes it possible to mute a DebugNumericSummary op unless there are nan, -inf or inf elements in the watched tensor, which is useful for reducing the amount of health pill data. Changes are made in the C++ DebugNodeInserter class, so that these attributes can be set directly from Python methods such as tf_debug.watch_graph() using the following syntax in the debug_ops argument: debug_ops=["DebugNumericSummary(attribute_name=attribute_value)"], e.g., debug_ops=["DebugNumericSummary(lower_bound=-100.0; mute_if_healthy=true)"]. Currently, string, float, int, and bool attribute value types are supported. Change: 150665493
author: Shanqing Cai <cais@google.com> 2017-03-20 12:11:05 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-03-20 13:39:28 -0700
commit: 3288f2eee7140e4a97c5976417fcbab5fe28a05c (patch)
tree: 1e1f18dbaf6fe63f566a064bbbc0ef30177a162d /tensorflow
parent: a7e5032f4d5cb054d86e0c7f2b8aaab293b43d43 (diff)
9 files changed, 594 insertions, 23 deletions
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 4b13171c97..2035922fdc 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -207,6 +207,21 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "debug_graph_utils_test",
+    size = "small",
+    srcs = ["debug_graph_utils_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":debug_graph_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 # TODO(cais): Add the following back in when tfdbg is supported on Android.
 # filegroup(
 #     name = "android_srcs",
diff --git a/tensorflow/core/debug/debug_graph_utils.cc b/tensorflow/core/debug/debug_graph_utils.cc
index 6ae5672860..b0982ec7d8 100644
--- a/tensorflow/core/debug/debug_graph_utils.cc
+++ b/tensorflow/core/debug/debug_graph_utils.cc
@@ -317,6 +317,120 @@ Status DebugNodeInserter::CreateCopyNode(
 }
 
 // static
+Status DebugNodeInserter::ParseDebugOpName(
+    const string& debug_op_name, string* debug_op_name_proper,
+    std::unordered_map<string, string>* attributes) {
+  const size_t l_index = debug_op_name.find('(');
+  const size_t r_index = debug_op_name.find(')');
+  if (l_index == string::npos && r_index == string::npos) {
+    *debug_op_name_proper = debug_op_name;
+  } else {
+    if (l_index == string::npos || l_index == 0 ||
+        r_index != debug_op_name.size() - 1) {
+      return errors::InvalidArgument("Malformed debug op name \"",
+                                     debug_op_name, "\"");
+    }
+
+    *debug_op_name_proper = debug_op_name.substr(0, l_index);
+    string arguments = debug_op_name.substr(l_index + 1, r_index - l_index - 1);
+
+    std::vector<string> attribute_segs = str_util::Split(arguments, ";");
+    for (const string& attribute_seg : attribute_segs) {
+      StringPiece seg(attribute_seg);
+      str_util::RemoveWhitespaceContext(&seg);
+      if (seg.empty()) {
+        continue;
+      }
+
+      const size_t eq_index = seg.find('=');
+      if (eq_index == string::npos) {
+        return errors::InvalidArgument(
+            "Malformed attributes in debug op name \"", debug_op_name, "\"");
+      }
+
+      const string key = seg.substr(0, eq_index).ToString();
+      const string value =
+          seg.substr(eq_index + 1, attribute_seg.size() - eq_index - 1)
+              .ToString();
+      if (key.empty() || value.empty()) {
+        return errors::InvalidArgument(
+            "Malformed attributes in debug op name \"", debug_op_name, "\"");
+      }
+
+      if (attributes->find(key) == attributes->end()) {
+        (*attributes)[key] = value;
+      } else {
+        return errors::InvalidArgument("Duplicate attribute name \"", key,
+                                       "\" found in the debug op: \"",
+                                       debug_op_name, "\"");
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// static
+Status DebugNodeInserter::SetDebugNodeAttributes(
+    Node* debug_node, const std::unordered_map<string, string>& attributes) {
+  std::unordered_set<string> unfulfilled_keys;
+  for (const auto& item : attributes) {
+    unfulfilled_keys.insert(item.first);
+  }
+
+  for (const auto& attr : debug_node->op_def().attr()) {
+    if (attributes.find(attr.name()) != attributes.end()) {
+      const string& attr_value = attributes.at(attr.name());
+      if (attr.type() == "string") {
+        debug_node->AddAttr<string>(attr.name(), attr_value);
+      } else if (attr.type() == "float") {
+        float float_value = 0.0;
+        if (!::tensorflow::strings::safe_strtof(attr_value.c_str(),
+                                                &float_value)) {
+          return errors::InvalidArgument(
+              "Invalid value string for float-type attribute ", attr.name(),
+              "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
+        }
+        debug_node->AddAttr<float>(attr.name(), float_value);
+      } else if (attr.type() == "int") {
+        int64 int_value = 0;
+        if (!::tensorflow::strings::safe_strto64(attr_value, &int_value)) {
+          return errors::InvalidArgument(
+              "Invalid value string for int-type attribute ", attr.name(),
+              "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
+        }
+        debug_node->AddAttr<int>(attr.name(), int_value);
+      } else if (attr.type() == "bool") {
+        string bool_str = str_util::Lowercase(attr_value);
+        if (bool_str == "false" || bool_str == "f" || bool_str == "0") {
+          debug_node->AddAttr<bool>(attr.name(), false);
+        } else if (bool_str == "true" || bool_str == "t" || bool_str == "1") {
+          debug_node->AddAttr<bool>(attr.name(), true);
+        } else {
+          return errors::InvalidArgument(
+              "Invalid value string for bool-type attribute ", attr.name(),
+              "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
+        }
+      } else {
+        return errors::InvalidArgument(
+            "Unsupported type of custom attribute for debug ops: ",
+            attr.type());
+      }
+
+      unfulfilled_keys.erase(attr.name());
+    }
+  }
+
+  if (unfulfilled_keys.empty()) {
+    return Status::OK();
+  } else {
+    return errors::InvalidArgument(
+        unfulfilled_keys.size(),
+        " attribute key(s) were not valid for debug node ", debug_node->name(),
+        ": ", str_util::Join(unfulfilled_keys, ", "));
+  }
+}
+
+// static
 Status DebugNodeInserter::CreateDebugNode(
     Graph* graph, const DeviceType device_type,
     const string& src_copy_node_name, const DataType src_dt,
@@ -325,29 +439,37 @@ Status DebugNodeInserter::CreateDebugNode(
   NodeDef node_def;
   const KernelDef* kdef;
 
+  string debug_op_name_proper;
+  std::unordered_map<string, string> custom_attributes;
+  TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op_name, &debug_op_name_proper,
+                                      &custom_attributes));
+
   const string debug_node_name =
-      GetDebugNodeName(tensor_name, debug_op_num, debug_op_name);
-  auto builder = NodeDefBuilder(debug_node_name, debug_op_name)
+      GetDebugNodeName(tensor_name, debug_op_num, debug_op_name_proper);
+  auto builder = NodeDefBuilder(debug_node_name, debug_op_name_proper)
                      .Input(src_copy_node_name, 0, src_dt)
                      .Attr("tensor_name", tensor_name)
                      .Attr("debug_urls", debug_urls);
 
   if (!builder.Finalize(&node_def).ok()) {
-    return Status(
-        error::FAILED_PRECONDITION,
-        strings::StrCat("Failed to create node definition ", "for debug op ",
-                        debug_op_name, " on watched tensor ", tensor_name));
+    return errors::FailedPrecondition(
+        "Failed to create node definition for debug op ", debug_op_name_proper,
+        " on watched tensor ", tensor_name);
   }
   if (!FindKernelDef(device_type, node_def, &kdef, nullptr).ok()) {
-    return Status(
-        error::FAILED_PRECONDITION,
-        strings::StrCat("Failed to find kernel definition ", "for debug op ",
-                        debug_op_name, " on watched tensor ", tensor_name));
+    return errors::FailedPrecondition(
+        "Failed to find kernel definition for debug op ", debug_op_name_proper,
+        " on watched tensor ", tensor_name);
   }
   if (!NodeBuilder(builder).Finalize(graph, debug_node).ok()) {
-    return Status(error::FAILED_PRECONDITION,
-                  strings::StrCat("Failed to create debug node ", debug_op_name,
-                                  " on watched tensor ", tensor_name));
+    return errors::FailedPrecondition("Failed to create debug node ",
+                                      debug_op_name_proper,
+                                      " on watched tensor ", tensor_name);
+  }
+
+  // Set custom attributes (if any).
+  if (!custom_attributes.empty()) {
+    TF_RETURN_IF_ERROR(SetDebugNodeAttributes(*debug_node, custom_attributes));
   }
 
   return Status::OK();
diff --git a/tensorflow/core/debug/debug_graph_utils.h b/tensorflow/core/debug/debug_graph_utils.h
index 6edd26c260..015149a64e 100644
--- a/tensorflow/core/debug/debug_graph_utils.h
+++ b/tensorflow/core/debug/debug_graph_utils.h
@@ -121,6 +121,19 @@ class DebugNodeInserter {
                                const int src_output, const DataType src_dt,
                                const string& tensor_name, Node** copy_node);
 
+  // Parse the debug_op_name string to extract proper op name and attributes.
+  // debug_op_name can be the proper op name only, e.g., "DebugNumericSummary".
+  // It can also contain customizable keys and values. Each key-value pair is
+  // connected with an equal sign ("="). Multiple key-value pairs are separated
+  // with semicolons (";"), with optional whitespace in between, e.g.,
+  // "DebugNumericSummary(mute_if_healthy=true; lower_bound=-100.0)".
+  static Status ParseDebugOpName(
+      const string& debug_op_name, string* debug_op_name_proper,
+      std::unordered_map<string, string>* attributes);
+
+  static Status SetDebugNodeAttributes(
+      Node* debug_node, const std::unordered_map<string, string>& attributes);
+
   static Status CreateDebugNode(Graph* graph, const DeviceType device_type,
                                 const string& src_copy_node_name,
                                 const DataType src_dt,
@@ -128,6 +141,8 @@ class DebugNodeInserter {
                                 const std::vector<string>& debug_urls,
                                 const int debug_op_num,
                                 const string& debug_op_name, Node** debug_node);
+
+  friend class DebugGraphUtilsTest;
 };
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/debug/debug_graph_utils_test.cc b/tensorflow/core/debug/debug_graph_utils_test.cc
new file mode 100644
index 0000000000..b3305e84a0
--- /dev/null
+++ b/tensorflow/core/debug/debug_graph_utils_test.cc
@@ -0,0 +1,161 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/debug/debug_graph_utils.h"
+
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+class DebugGraphUtilsTest : public ::testing::Test {
+ protected:
+  Status ParseDebugOpName(const string& debug_op_name,
+                          string* debug_op_name_proper,
+                          std::unordered_map<string, string>* attributes) {
+    return DebugNodeInserter::ParseDebugOpName(
+        debug_op_name, debug_op_name_proper, attributes);
+  }
+};
+
+TEST_F(DebugGraphUtilsTest, TestParseNoAttributeDebugOpName) {
+  string debug_op_name_proper;
+  std::unordered_map<string, string> attributes;
+  TF_ASSERT_OK(
+      ParseDebugOpName("DebugIdentity", &debug_op_name_proper, &attributes));
+  ASSERT_EQ("DebugIdentity", debug_op_name_proper);
+  ASSERT_EQ(0, attributes.size());
+}
+
+TEST_F(DebugGraphUtilsTest, TestMalformedDebugOpName) {
+  string debug_op_name_proper;
+  std::unordered_map<string, string> attributes;
+
+  Status s = ParseDebugOpName("(mute_if_healthy=true)", &debug_op_name_proper,
+                              &attributes);
+  ASSERT_EQ(errors::Code::INVALID_ARGUMENT, s.code());
+
+  s = ParseDebugOpName("DebugNumericSummary(", &debug_op_name_proper,
+                       &attributes);
+  ASSERT_EQ(errors::Code::INVALID_ARGUMENT, s.code());
+
+  s = ParseDebugOpName("DebugNumericSummary)", &debug_op_name_proper,
+                       &attributes);
+  ASSERT_EQ(errors::Code::INVALID_ARGUMENT, s.code());
+}
+
+TEST_F(DebugGraphUtilsTest, TestDebugOpNameWithMalformedAttributes) {
+  string debug_op_name_proper;
+  std::unordered_map<string, string> attributes;
+
+  Status s = ParseDebugOpName("DebugNumericSummary(=)", &debug_op_name_proper,
+                              &attributes);
+  ASSERT_EQ(errors::Code::INVALID_ARGUMENT, s.code());
+
+  s = ParseDebugOpName("DebugNumericSummary(mute_if_healthy=)",
+                       &debug_op_name_proper, &attributes);
+  ASSERT_EQ(errors::Code::INVALID_ARGUMENT, s.code());
+
+  s = ParseDebugOpName("DebugNumericSummary(=true)", &debug_op_name_proper,
+                       &attributes);
+  ASSERT_EQ(errors::Code::INVALID_ARGUMENT, s.code());
+
+  s = ParseDebugOpName("DebugNumericSummary(mute_if_healthy:true)",
+                       &debug_op_name_proper, &attributes);
+  ASSERT_EQ(errors::Code::INVALID_ARGUMENT, s.code());
+
+  s = ParseDebugOpName("DebugNumericSummary(mute_if_healthy=true;threshold=)",
+                       &debug_op_name_proper, &attributes);
+  ASSERT_EQ(errors::Code::INVALID_ARGUMENT, s.code());
+
+  s = ParseDebugOpName(
+      "DebugNumericSummary(mute_if_healthy=true;threshold:300.0)",
+      &debug_op_name_proper, &attributes);
+  ASSERT_EQ(errors::Code::INVALID_ARGUMENT, s.code());
+}
+
+TEST_F(DebugGraphUtilsTest, TestValidDebugOpNameWithSingleAttribute) {
+  string debug_op_name_proper;
+  std::unordered_map<string, string> attributes;
+
+  TF_ASSERT_OK(ParseDebugOpName("DebugNumericSummary()", &debug_op_name_proper,
+                                &attributes));
+  ASSERT_EQ("DebugNumericSummary", debug_op_name_proper);
+  ASSERT_EQ(0, attributes.size());
+
+  attributes.clear();
+  TF_ASSERT_OK(ParseDebugOpName("DebugNumericSummary(mute_if_healthy=true)",
+                                &debug_op_name_proper, &attributes));
+  ASSERT_EQ("DebugNumericSummary", debug_op_name_proper);
+  ASSERT_EQ(1, attributes.size());
+  ASSERT_EQ("true", attributes["mute_if_healthy"]);
+}
+
+TEST_F(DebugGraphUtilsTest, TestValidDebugOpNameWithMoreThanOneAttributes) {
+  string debug_op_name_proper;
+  std::unordered_map<string, string> attributes;
+  TF_ASSERT_OK(ParseDebugOpName(
+      "DebugNumericSummary(mute_if_healthy=true; threshold=300.0)",
+      &debug_op_name_proper, &attributes));
+  ASSERT_EQ("DebugNumericSummary", debug_op_name_proper);
+  ASSERT_EQ(2, attributes.size());
+  ASSERT_EQ("true", attributes["mute_if_healthy"]);
+  ASSERT_EQ("300.0", attributes["threshold"]);
+
+  attributes.clear();
+  TF_ASSERT_OK(ParseDebugOpName(
+      "DebugNumericSummary(mute_if_healthy=true;threshold=300.0;first_n=100)",
+      &debug_op_name_proper, &attributes));
+  ASSERT_EQ("DebugNumericSummary", debug_op_name_proper);
+  ASSERT_EQ(3, attributes.size());
+  ASSERT_EQ("true", attributes["mute_if_healthy"]);
+  ASSERT_EQ("300.0", attributes["threshold"]);
+  ASSERT_EQ("100", attributes["first_n"]);
+}
+
+TEST_F(DebugGraphUtilsTest, TestValidDebugOpNameWithMoreDuplicatettributes) {
+  string debug_op_name_proper;
+  std::unordered_map<string, string> attributes;
+  Status s = ParseDebugOpName(
+      "DebugNumericSummary(mute_if_healthy=true; lower_bound=3; "
+      "mute_if_healthy=false;)",
+      &debug_op_name_proper, &attributes);
+  ASSERT_EQ(errors::Code::INVALID_ARGUMENT, s.code());
+}
+
+TEST_F(DebugGraphUtilsTest, TestValidDebugOpNameWithWhitespaceInAttributes) {
+  string debug_op_name_proper;
+  std::unordered_map<string, string> attributes;
+
+  TF_ASSERT_OK(ParseDebugOpName(
+      "DebugNumericSummary(  mute_if_healthy=true; threshold=300.0  )",
+      &debug_op_name_proper, &attributes));
+  ASSERT_EQ("DebugNumericSummary", debug_op_name_proper);
+  ASSERT_EQ(2, attributes.size());
+  ASSERT_EQ("true", attributes["mute_if_healthy"]);
+  ASSERT_EQ("300.0", attributes["threshold"]);
+
+  attributes.clear();
+  TF_ASSERT_OK(ParseDebugOpName(
+      "DebugNumericSummary(;;mute_if_healthy=true; threshold=300.0;;)",
+      &debug_op_name_proper, &attributes));
+  ASSERT_EQ("DebugNumericSummary", debug_op_name_proper);
+  ASSERT_EQ(2, attributes.size());
+  ASSERT_EQ("true", attributes["mute_if_healthy"]);
+  ASSERT_EQ("300.0", attributes["threshold"]);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h
index bd42997732..5437bc5a33 100644
--- a/tensorflow/core/kernels/debug_ops.h
+++ b/tensorflow/core/kernels/debug_ops.h
@@ -168,6 +168,10 @@ class DebugNumericSummaryOp : public OpKernel {
       : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));
     OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
+    OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_));
+    OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("mute_if_healthy", &mute_if_healthy_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -196,6 +200,9 @@ class DebugNumericSummaryOp : public OpKernel {
       const T* input_flat = input.template flat<T>().data();
 
       element_count = input_shape.num_elements();
+      const bool is_lower_bound_custom = !Eigen::numext::isinf(lower_bound_);
+      const bool is_upper_bound_custom = !Eigen::numext::isinf(upper_bound_);
+
       for (int64 i = 0; i < element_count; ++i) {
         const double x = static_cast<double>(input_flat[i]);
         if (Eigen::numext::isnan(x)) {
@@ -207,7 +214,11 @@ class DebugNumericSummaryOp : public OpKernel {
             positive_inf_count++;
           }
         } else {
-          if (x < 0.0) {
+          if (is_lower_bound_custom && x <= lower_bound_) {
+            negative_inf_count++;
+          } else if (is_upper_bound_custom && x >= upper_bound_) {
+            positive_inf_count++;
+          } else if (x < 0.0) {
             negative_count++;
           } else if (x > 0.0) {
             positive_count++;
@@ -259,7 +270,9 @@ class DebugNumericSummaryOp : public OpKernel {
     output_tensor->vec<double>()(10) = mean;
     output_tensor->vec<double>()(11) = variance;
 
-    if (!debug_urls_.empty()) {
+    bool mute = mute_if_healthy_ && nan_count == 0 && negative_inf_count == 0 &&
+                positive_inf_count == 0;
+    if (!mute && !debug_urls_.empty()) {
       // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
       DebugIO::PublishDebugTensor(tensor_name_, "DebugNumericSummary",
                                   *output_tensor, Env::Default()->NowMicros(),
@@ -273,6 +286,9 @@ class DebugNumericSummaryOp : public OpKernel {
  private:
   string tensor_name_;
   std::vector<string> debug_urls_;
+  float lower_bound_;
+  float upper_bound_;
+  bool mute_if_healthy_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/debug_ops_test.cc b/tensorflow/core/kernels/debug_ops_test.cc
index 152afc12a1..917d4c5299 100644
--- a/tensorflow/core/kernels/debug_ops_test.cc
+++ b/tensorflow/core/kernels/debug_ops_test.cc
@@ -485,5 +485,92 @@ TEST_F(DebugNumericSummaryOpTest, BoolSuccess) {
   test::ExpectTensorNear<double>(expected, *GetOutput(0), 1e-8);
 }
 
+// Tests for DebugNumericSummaryOp with a custom lower_bound attribute.
+class DebugNumericSummaryOpCustomLowerBoundTest : public OpsTestBase {
+ protected:
+  Status Init(DataType input_type) {
+    TF_CHECK_OK(NodeDefBuilder("op", "DebugNumericSummary")
+                    .Input(FakeInput(input_type))
+                    .Attr("tensor_name", "FakeTensor:0")
+                    .Attr("lower_bound", -1.2f)
+                    .Finalize(node_def()));
+    return InitOp();
+  }
+};
+
+TEST_F(DebugNumericSummaryOpCustomLowerBoundTest, Float_full_house) {
+  TF_ASSERT_OK(Init(DT_FLOAT));
+  AddInputFromArray<float>(
+      TensorShape({18}),
+      {std::numeric_limits<float>::quiet_NaN(),
+       std::numeric_limits<float>::quiet_NaN(), 0.0f, 0.0f, 0.0f, -1.0f, -3.0f,
+       3.0f, 7.0f, -std::numeric_limits<float>::infinity(),
+       -std::numeric_limits<float>::infinity(),
+       std::numeric_limits<float>::infinity(),
+       std::numeric_limits<float>::infinity(),
+       std::numeric_limits<float>::infinity(),
+       std::numeric_limits<float>::infinity(),
+       std::numeric_limits<float>::infinity(),
+       std::numeric_limits<float>::quiet_NaN(),
+       std::numeric_limits<float>::quiet_NaN()});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_DOUBLE, TensorShape({12}));
+  test::FillValues<double>(
+      &expected,
+      {1.0,              // Is initialized.
+       18.0,             // Total element count.
+       4.0,              // nan count.
+       3.0,              // -inf count.
+       1.0,              // negative number count (excluding -inf).
+       3.0,              // zero count.
+       2.0,              // positive number count (excluding +inf).
+       5.0,              // +inf count.
+       -3.0,             // minimum of non-inf and non-nan elements.
+       7.0,              // maximum of non-inf and non-nan elements.
+       0.85714285714,    // mean of non-inf and non-nan elements.
+       8.97959183673});  // variance of non-inf and non-nan elements.
+
+  test::ExpectTensorNear<double>(expected, *GetOutput(0), 1e-8);
+}
+
+// Tests for DebugNumericSummaryOp with custom lower_bound and upper_bound.
+class DebugNumericSummaryOpCustomLowerUpperBoundsTest : public OpsTestBase {
+ protected:
+  Status Init(DataType input_type) {
+    TF_CHECK_OK(NodeDefBuilder("op", "DebugNumericSummary")
+                    .Input(FakeInput(input_type))
+                    .Attr("tensor_name", "FakeTensor:0")
+                    .Attr("lower_bound", -0.5f)
+                    .Attr("upper_bound", 3.6f)
+                    .Finalize(node_def()));
+    return InitOp();
+  }
+};
+
+TEST_F(DebugNumericSummaryOpCustomLowerUpperBoundsTest, Int32Success) {
+  TF_ASSERT_OK(Init(DT_INT32));
+  AddInputFromArray<int32>(TensorShape({2, 3}), {0, 0, -1, 3, 3, 7});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_DOUBLE, TensorShape({12}));
+  test::FillValues<double>(
+      &expected,
+      {1.0,              // Is initialized.
+       6.0,              // Total element count.
+       0.0,              // nan count.
+       1.0,              // -inf count.
+       0.0,              // negative count (excluding -inf).
+       2.0,              // zero count.
+       2.0,              // positive count (excluding +inf).
+       1.0,              // +inf count.
+       -1.0,             // minimum of non-inf and non-nan elements.
+       7.0,              // maximum of non-inf and non-nan elements.
+       2.0,              // mean of non-inf and non-nan elements.
+       7.33333333333});  // variance of non-inf and non-nan elements.
+
+  test::ExpectTensorNear<double>(expected, *GetOutput(0), 1e-8);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/debug_ops.cc b/tensorflow/core/ops/debug_ops.cc
index 66dbd9cc63..63f6b60584 100644
--- a/tensorflow/core/ops/debug_ops.cc
+++ b/tensorflow/core/ops/debug_ops.cc
@@ -101,6 +101,9 @@ REGISTER_OP("DebugNumericSummary")
     .Attr("T: type")
     .Attr("tensor_name: string = ''")
     .Attr("debug_urls: list(string) = []")
+    .Attr("lower_bound: float = -inf")
+    .Attr("upper_bound: float = inf")
+    .Attr("mute_if_healthy: bool = false")
     .SetAllowsUninitializedInput()
     .Doc(R"doc(
 Debug Numeric Summary Op.
@@ -111,12 +114,16 @@ input: Input tensor, non-Reference type, float or double.
 output: A double tensor of shape [12], the elements of which are:
   [0]: is initialized (1.0) or not (0.0).
   [1]: total number of elements
-  [2]: -inf count
-  [3]: negative element count (excluding -inf)
-  [4]: zero element count
-  [5]: positive element count (excluding +inf)
-  [6]: +inf element count
-  [7]: NaN element count
+  [2]: NaN element count
+  [3]: generalized -inf count: elements <= lower_bound. lower_bound is -inf by
+    default.
+  [4]: negative element count (excluding -inf), if lower_bound is the default
+    -inf. Otherwise, this is the count of elements > lower_bound and < 0.
+  [5]: zero element count
+  [6]: positive element count (excluding +inf), if upper_bound is the default
+    +inf. Otherwise, this is the count of elements < upper_bound and > 0.
+  [7]: generalized +inf count, elements >= upper_bound. upper_bound is +inf by
+    default.
 Output elements [1:8] are all zero, if the tensor is uninitialized.
   [8]: minimum of all non-inf and non-NaN elements.
        If uninitialized or no such element exists: +inf.
@@ -129,7 +136,15 @@ Output elements [1:8] are all zero, if the tensor is uninitialized.
 
 tensor_name: Name of the input tensor.
 debug_urls: List of URLs to debug targets, e.g.,
-            file:///foo/tfdbg_dump, grpc:://localhost:11011
+  file:///foo/tfdbg_dump, grpc://localhost:11011
+lower_bound: (float) The lower bound <= which values will be included in the
+  generalized -inf count. Default: -inf.
+upper_bound: (float) The upper bound >= which values will be included in the
+  generalized +inf count. Default: +inf.
+mute_if_healthy: (bool) Do not send data to the debug URLs unless at least one
+  of elements [2], [3] and [7] (i.e., the nan count and the generalized -inf and
+  inf counts) is non-zero.
+
 )doc");
 
-}  // namespace tensorflow
-\ No newline at end of file
+}  // namespace tensorflow
diff --git a/tensorflow/python/debug/lib/debug_utils.py b/tensorflow/python/debug/lib/debug_utils.py
index 7163936631..1a15c0391d 100644
--- a/tensorflow/python/debug/lib/debug_utils.py
+++ b/tensorflow/python/debug/lib/debug_utils.py
@@ -42,6 +42,9 @@ def add_debug_tensor_watch(run_options,
     debug_ops: (`str` or `list` of `str`) name(s) of the debug op(s). Can be a
       `list` of `str` or a single `str`. The latter case is equivalent to a
       `list` of `str` with only one element.
+      For debug op types with customizable attributes, each debug op string can
+      optionally contain attribute name=value pairs, in the syntax of:
+        debug_op_name(attr_name_1=attr_value_1;attr_name_2=attr_value_2;...)
     debug_urls: (`str` or `list` of `str`) URL(s) to send debug values to,
       e.g., `file:///tmp/tfdbg_dump_1`, `grpc://localhost:12345`.
     tolerate_debug_op_creation_failures: (`bool`) Whether to tolerate debug op
@@ -97,6 +100,9 @@ def watch_graph(run_options,
       a single string, or None. The case of a single string is equivalent to
       a list consisting of a single string, e.g., `file:///tmp/tfdbg_dump_1`,
       `grpc://localhost:12345`.
+      For debug op types with customizable attributes, each debug op name string
+      can optionally contain attribute name=value pairs, in the syntax of:
+        debug_op_name(attr_name_1=attr_value_1;attr_name_2=attr_value_2;...)
     node_name_regex_whitelist: Regular-expression whitelist for node_name,
       e.g., `"(weight_[0-9]+|bias_.*)"`
     op_type_regex_whitelist: Regular-expression whitelist for the op type of
@@ -178,6 +184,7 @@ def watch_graph_with_blacklists(run_options,
     run_options: An instance of `config_pb2.RunOptions` to be modified.
     graph: An instance of `ops.Graph`.
     debug_ops: (`str` or `list` of `str`) name(s) of the debug op(s) to use.
+      See the documentation of `watch_graph` for more details.
     debug_urls: URL(s) to send debug values to, e.g.,
       `file:///tmp/tfdbg_dump_1`, `grpc://localhost:12345`.
     node_name_regex_blacklist: Regular-expression blacklist for node_name.
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index fd4b4aecd6..d733a5e210 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -1146,6 +1146,139 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       self.assertIn("n:0:DebugNumericSummary", dump.debug_watch_keys("n"))
       self.assertIn("m:0:DebugNumericSummary", dump.debug_watch_keys("m"))
 
+  def testDebugNumericSummaryInvalidAttributesStringAreCaught(self):
+    with session.Session() as sess:
+      a = variables.Variable(10.0, name="a")
+      b = variables.Variable(0.0, name="b")
+      c = variables.Variable(0.0, name="c")
+
+      x = math_ops.divide(a, b, name="x")
+      y = math_ops.multiply(x, c, name="y")
+
+      sess.run(variables.global_variables_initializer())
+
+      run_metadata = config_pb2.RunMetadata()
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_utils.watch_graph(
+          run_options,
+          sess.graph,
+          debug_ops=["DebugNumericSummary(foo=1.0)"],
+          debug_urls=self._debug_urls())
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          r"1 attribute key\(s\) were not valid for debug node "
+          r"__dbg_a:0_0_DebugNumericSummary: foo"):
+        sess.run(y, options=run_options, run_metadata=run_metadata)
+
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_utils.watch_graph(
+          run_options,
+          sess.graph,
+          debug_ops=["DebugNumericSummary(foo=1.0; bar=false)"],
+          debug_urls=self._debug_urls())
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          r"2 attribute key\(s\) were not valid for debug node "
+          r"__dbg_a:0_0_DebugNumericSummary:"):
+        sess.run(y, options=run_options, run_metadata=run_metadata)
+
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_utils.watch_graph(
+          run_options,
+          sess.graph,
+          debug_ops=["DebugNumericSummary(foo=1.0; mute_if_healthy=true)"],
+          debug_urls=self._debug_urls())
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          r"1 attribute key\(s\) were not valid for debug node "
+          r"__dbg_a:0_0_DebugNumericSummary: foo"):
+        sess.run(y, options=run_options, run_metadata=run_metadata)
+
+  def testDebugNumericSummaryMuteOnHealthyMutesOnlyHealthyTensorDumps(self):
+    with session.Session() as sess:
+      a = variables.Variable(10.0, name="a")
+      b = variables.Variable(0.0, name="b")
+      c = variables.Variable(0.0, name="c")
+
+      x = math_ops.divide(a, b, name="x")
+      y = math_ops.multiply(x, c, name="y")
+
+      sess.run(variables.global_variables_initializer())
+
+      run_metadata = config_pb2.RunMetadata()
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_utils.watch_graph(
+          run_options,
+          sess.graph,
+          debug_ops=["DebugNumericSummary(mute_if_healthy=true)"],
+          debug_urls=self._debug_urls())
+      sess.run(y, options=run_options, run_metadata=run_metadata)
+
+      dump = debug_data.DebugDumpDir(
+          self._dump_root, partition_graphs=run_metadata.partition_graphs,
+          validate=False)
+      # Here, validate=False is necessary to avoid causality check error.
+      # TODO(cais): Maybe let DebugDumpDir constructor automatically ignore
+      #   debug ops with mute_if_healthy=true attribute during validation.
+
+      self.assertEqual(2, dump.size)
+      self.assertAllClose(
+          [[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, np.inf, -np.inf, np.nan,
+            np.nan]],
+          dump.get_tensors("x", 0, "DebugNumericSummary"))
+      self.assertAllClose(
+          [[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, np.inf, -np.inf, np.nan,
+            np.nan]],
+          dump.get_tensors("y", 0, "DebugNumericSummary"))
+
+      # Another run with the default mute_if_healthy (false) value should
+      # dump all the tensors.
+      shutil.rmtree(self._dump_root)
+      run_metadata = config_pb2.RunMetadata()
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_utils.watch_graph(
+          run_options,
+          sess.graph,
+          debug_ops=["DebugNumericSummary()"],
+          debug_urls=self._debug_urls())
+      sess.run(y, options=run_options, run_metadata=run_metadata)
+
+      dump = debug_data.DebugDumpDir(
+          self._dump_root, partition_graphs=run_metadata.partition_graphs)
+      self.assertEqual(8, dump.size)
+
+  def testDebugNumericSummaryMuteOnHealthyAndCustomBoundsWork(self):
+    with session.Session() as sess:
+      a = variables.Variable([10.0, 10.0], name="a")
+      b = variables.Variable([10.0, 2.0], name="b")
+
+      x = math_ops.add(a, b, name="x")  # [20.0, 12.0]
+      y = math_ops.divide(x, b, name="y")  # [2.0, 6.0]
+
+      sess.run(variables.global_variables_initializer())
+
+      run_metadata = config_pb2.RunMetadata()
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_utils.watch_graph(
+          run_options,
+          sess.graph,
+          debug_ops=[
+              "DebugNumericSummary(mute_if_healthy=true; upper_bound=11.0)"],
+          debug_urls=self._debug_urls())
+      sess.run(y, options=run_options, run_metadata=run_metadata)
+
+      dump = debug_data.DebugDumpDir(
+          self._dump_root, partition_graphs=run_metadata.partition_graphs,
+          validate=False)
+      # Here, validate=False is necessary to avoid causality check error.
+      # TODO(cais): Maybe let DebugDumpDir constructor automatically ignore
+      #   debug ops with mute_if_healthy=true attribute during validation.
+
+      self.assertEqual(1, dump.size)
+      self.assertAllClose(
+          [[1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 12.0, 20.0, 16.0, 16.0]],
+          dump.get_tensors("x", 0, "DebugNumericSummary"))
+
   def testDebugQueueOpsDoesNotoErrorOut(self):
     with session.Session() as sess:
       q = data_flow_ops.FIFOQueue(3, "float", name="fifo_queue")
author	Shanqing Cai <cais@google.com>	2017-03-20 12:11:05 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-03-20 13:39:28 -0700
commit	3288f2eee7140e4a97c5976417fcbab5fe28a05c (patch)
tree	1e1f18dbaf6fe63f566a064bbbc0ef30177a162d /tensorflow
parent	a7e5032f4d5cb054d86e0c7f2b8aaab293b43d43 (diff)