From 7fe0ae12ec42eca1ea07d93bbd63de394743a018 Mon Sep 17 00:00:00 2001
From: Pradeep Banavara
Date: Thu, 12 Jul 2018 21:52:13 -0400
Subject: Fix: #12686 SoftmaxCrossEntropyWithLogits

Committing in a new PR as the old PR has too many commit files.
---
 tensorflow/cc/gradients/nn_grad.cc      | 94 +++++++++++++++++++++++++++++----
 tensorflow/cc/gradients/nn_grad_test.cc | 29 ++++++++--
 2 files changed, 109 insertions(+), 14 deletions(-)

diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index c73482d5f4..dc6477e59d 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -47,6 +47,81 @@ Status SoftmaxGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad);
 
+bool IsZero(const Scope& scope, Output grad) {
+  std::array<const char*, 2> zero_op_type_names{{"ZerosLike", "Zeros"}};
+  string op_type_name = grad.op().node()->type_string();
+  for (auto& zero_op_type_name : zero_op_type_names) {
+    if (op_type_name == zero_op_type_name) {
+      return true;
+    }
+  }
+  // The Operation we were provided is not named something obvious, so we
+  // need to actually look at its contents. The original Python code did this
+  // by calling a utility function, tensor_util.constant_value. When you dig
+  // into tensor_util.constant_value, it is a large number of 'if' statements
+  // covering the edge cases where it is possible to get the value of the
+  // tensor without actually evaluating it. There are many kinds of tensors
+  // for which this cannot be done.
+  // There is no C++ equivalent to tensor_util.constant_value, so we do
+  // nothing for the moment.
+  return false;
+}
+
+Output BroadcastMul(const Scope& scope, Output vec, Output mat) {
+  /* Multiply after broadcasting vec to match the dimensions of mat.
+     Args:
+       vec: A 1-D tensor of dimension [D0]
+       mat: A 2-D tensor of dimension [D0, D1]
+
+     Returns:
+       A tensor of dimension [D0, D1], the result of vec * mat
+       (an element-wise multiply).
+  */
+  auto reshaped = ExpandDims(scope, vec, -1);
+  return Multiply(scope, reshaped, mat);
+}
+
+Status SoftmaxCrossEntropyWithLogitsGrad(const Scope& scope,
+                                         const Operation& op,
+                                         const std::vector<Output>& grad_inputs,
+                                         std::vector<Output>* grad_outputs) {
+  // Softmax gradient with the cross-entropy logits function: the incoming
+  // gradient for the loss is multiplied by the op's backprop output,
+  // op.output(1). A gradient is also produced for the labels input.
+
+  // The outputs of the network are at input index 0; the "truth" labels are
+  // at input index 1.
+  auto logits = op.input(0);
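+  // op.output(1) ("backprop") evaluates to softmax(logits) - labels, i.e. the
+  // per-example gradient of the loss with respect to the logits.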
+  auto softmax_grad = op.output(1);
+
+  // The documentation for ops::SoftmaxCrossEntropyWithLogits says that loss
+  // is the output at index 0, and backprop is the output at index 1.
+  auto grad_loss = grad_inputs[0];
+  auto grad_grad = grad_inputs[1];
+
+  auto grad = BroadcastMul(scope, grad_loss, softmax_grad);
+  if (!IsZero(scope, grad_grad)) {
+    std::vector<int> axis;
+    auto logitsSoftmax = Softmax(scope, logits);
+
+    auto grad_gradExpand = ExpandDims(scope, grad_grad, 1);
+    auto logitsSoftmaxExpand = ExpandDims(scope, logitsSoftmax, 2);
+    auto matMulResult =
+        BatchMatMul(scope, grad_gradExpand, logitsSoftmaxExpand);
+    axis.push_back(1);
+    auto squeezeResult = Squeeze(scope, matMulResult, Squeeze::Axis(axis));
+    auto subtractionResult = Subtract(scope, grad_grad, squeezeResult);
+    auto multiplyResult = Multiply(scope, subtractionResult, logitsSoftmax);
+    grad = Add(scope, grad, multiplyResult);
+  }
+  auto minusLogSoftmax = Multiply(scope, LogSoftmax(scope, logits), -1.0f);
+  grad_outputs->push_back(grad);
+  grad_outputs->push_back(BroadcastMul(scope, grad_loss, minusLogSoftmax));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("SoftmaxCrossEntropyWithLogits",
+                     SoftmaxCrossEntropyWithLogitsGrad);
+
 Status LogSoftmaxGrad(const Scope& scope, const Operation& op,
                       const std::vector<Output>& grad_inputs,
                       std::vector<Output>* grad_outputs) {
@@ -195,9 +270,9 @@ Status MaxPool3DGradHelper(const Scope& scope, const Operation& op,
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
   MaxPool3DGrad::Attrs grad_attrs;
-  auto dx = MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0],
-                          ksize, strides, padding,
-                          grad_attrs.DataFormat(data_format));
+  auto dx =
+      MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0], ksize,
+                    strides, padding, grad_attrs.DataFormat(data_format));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -216,10 +291,9 @@ Status AvgPoolGradHelper(const Scope& scope, const Operation& op,
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
   internal::AvgPoolGrad::Attrs grad_attrs;
-  auto dx =
-      internal::AvgPoolGrad(scope, Shape(scope, op.input(0)), grad_inputs[0],
-                            ksize, strides, padding,
-                            grad_attrs.DataFormat(data_format));
+  auto dx = internal::AvgPoolGrad(scope, Shape(scope, op.input(0)),
+                                  grad_inputs[0], ksize, strides, padding,
+                                  grad_attrs.DataFormat(data_format));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -238,9 +312,9 @@ Status AvgPool3DGradHelper(const Scope& scope, const Operation& op,
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
   AvgPool3DGrad::Attrs grad_attrs;
-  auto dx = AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0],
-                          ksize, strides, padding,
-                          grad_attrs.DataFormat(data_format));
+  auto dx =
+      AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0], ksize,
+                    strides, padding, grad_attrs.DataFormat(data_format));
   grad_outputs->push_back(dx);
   return scope.status();
 }
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index b4d457a9d1..f26a7e99e6 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -25,6 +25,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+using ops::AvgPool;
+using ops::AvgPool3D;
 using ops::BiasAdd;
 using ops::Conv2D;
 using ops::Elu;
@@ -33,11 +35,9 @@ using ops::FractionalMaxPool;
 using ops::L2Loss;
 using ops::LogSoftmax;
 using ops::LRN;
-using ops::AvgPool;
-using ops::AvgPool3D;
 using ops::MaxPool;
-using ops::MaxPoolV2;
 using ops::MaxPool3D;
+using ops::MaxPoolV2;
 using ops::Placeholder;
 using ops::Relu;
 using ops::Relu6;
@@ -111,6 +111,27 @@ TEST_F(NNGradTest, SoftmaxGrad) {
   RunTest(x, shape, y, shape);
 }
 
+TEST_F(NNGradTest, SoftmaxCrossEntropyWithLogitsGrad) {
+  // A batch of 5 examples, each with 3 possible labels (classes). The logits
+  // are what the network produces; they are compared against the "truth"
+  // labels.
+  TensorShape logitsShape({5, 3});
+  // One loss value for each entry in the batch.
+  TensorShape lossShape({5});
+
+  auto logits = Placeholder(scope_, DT_FLOAT,
+                            Placeholder::Shape(logitsShape));  // estimation
+  auto labels =
+      Placeholder(scope_, DT_FLOAT, Placeholder::Shape(logitsShape));  // truth
+  auto y =
+      tensorflow::ops::SoftmaxCrossEntropyWithLogits(scope_, logits, labels);
+  // Note the reversal of the backprop and loss orders; a separate issue,
+  // #18734, has been opened for this.
+  RunTest({logits, labels}, {logitsShape, logitsShape}, {y.backprop, y.loss},
+          {logitsShape, lossShape});
+}
+
 TEST_F(NNGradTest, LogSoftmaxGrad) {
   TensorShape shape({5, 3});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
@@ -253,7 +274,7 @@ TEST_F(NNGradTest, AvgPool3DGradHelper) {
   RunTest(x, x_shape, y, y_shape);
 }
 
-TEST_F(NNGradTest, LRN){
+TEST_F(NNGradTest, LRN) {
   TensorShape x_shape({1, 1, 2, 1});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
   auto y = LRN(scope_, x);
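For context, here is a minimal sketch of driving the newly registered gradient
through the public AddSymbolicGradients API. The shapes, variable names, and
main() wrapper are assumptions for illustration only, and the nn_grad gradient
library must be linked in so the registration above is present:

    #include <vector>

    #include "tensorflow/cc/framework/gradients.h"
    #include "tensorflow/cc/ops/standard_ops.h"
    #include "tensorflow/core/framework/tensor_shape.h"

    int main() {
      using namespace tensorflow;  // for brevity in this sketch

      Scope scope = Scope::NewRootScope();
      TensorShape logits_shape({5, 3});

      // Same setup as the new test: logits and labels share the [5, 3] shape.
      auto logits = ops::Placeholder(scope, DT_FLOAT,
                                     ops::Placeholder::Shape(logits_shape));
      auto labels = ops::Placeholder(scope, DT_FLOAT,
                                     ops::Placeholder::Shape(logits_shape));
      auto y = ops::SoftmaxCrossEntropyWithLogits(scope, logits, labels);

      // AddSymbolicGradients looks up the gradient function registered for
      // "SoftmaxCrossEntropyWithLogits" to build d(loss)/d(logits).
      std::vector<Output> grad_outputs;
      TF_CHECK_OK(AddSymbolicGradients(scope, {y.loss}, {Output(logits)},
                                       &grad_outputs));
      return 0;
    }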