From 7fe0ae12ec42eca1ea07d93bbd63de394743a018 Mon Sep 17 00:00:00 2001
From: Pradeep Banavara
Date: Thu, 12 Jul 2018 21:52:13 -0400
Subject: Fix: #12686 SoftmaxCrossEntropyWithLogits

Committing in a new PR as the old PR has too many commit files.
---
 tensorflow/cc/gradients/nn_grad.cc      | 94 +++++++++++++++++++++++++++++----
 tensorflow/cc/gradients/nn_grad_test.cc | 29 ++++++++--
 2 files changed, 109 insertions(+), 14 deletions(-)

diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index c73482d5f4..dc6477e59d 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -47,6 +47,81 @@ Status SoftmaxGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad);
 
+bool IsZero(const Scope& scope, Output grad) {
+  std::array<const char*, 2> zero_op_type_names{{"ZerosLike", "Zeros"}};
+  string op_type_name = grad.op().node()->type_string();
+  for (auto& zero_op_type_name : zero_op_type_names) {
+    if (op_type_name == zero_op_type_name) {
+      return true;
+    }
+  }
+  // The Operation we were provided is not named something obvious, so we
+  // need to actually look at its contents. The original Python code did this
+  // by calling a utility function, tensor_util.constant_value. When you dig
+  // into tensor_util.constant_value, it is a large number of 'if' statements
+  // covering the edge cases where it is possible to get the value of the
+  // tensor without actually evaluating it. There are many kinds of tensors
+  // for which this cannot be done.
+  // There is no C++ equivalent to tensor_util.constant_value, so we do
+  // nothing for the moment.
+  return false;
+}
+
+Output BroadcastMul(const Scope& scope, Output vec, Output mat) {
+  /* Multiply after broadcasting vec to match the dimensions of mat.
+     Args:
+       vec: A 1-D tensor of dimension [D0]
+       mat: A 2-D tensor of dimension [D0, D1]
+
+     Returns:
+       A tensor of dimension [D0, D1], the result of vec * mat
+       (an element-wise multiply).
+  */
+  auto reshaped = ExpandDims(scope, vec, -1);
+  return Multiply(scope, reshaped, mat);
+}
+
+Status SoftmaxCrossEntropyWithLogitsGrad(const Scope& scope,
+                                         const Operation& op,
+                                         const std::vector<Output>& grad_inputs,
+                                         std::vector<Output>* grad_outputs) {
+  // Softmax gradient with the cross-entropy logits function: the incoming
+  // gradient for the loss is multiplied by the op's backprop output,
+  // op.output(1). A gradient is also produced for the labels input.
+
+  // The outputs of the network are at input index 0; the "truth" labels are
+  // at input index 1.
+  auto logits = op.input(0);
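+  // op.output(1) ("backprop") evaluates to softmax(logits) - labels, i.e. the
+  // per-example gradient of the loss with respect to the logits.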
+  auto softmax_grad = op.output(1);
+
+  // The documentation for ops::SoftmaxCrossEntropyWithLogits says that loss
+  // is the output at index 0, and backprop is the output at index 1.
+  auto grad_loss = grad_inputs[0];
+  auto grad_grad = grad_inputs[1];
+
+  auto grad = BroadcastMul(scope, grad_loss, softmax_grad);
+  if (!IsZero(scope, grad_grad)) {
+    std::vector<int> axis;
+    auto logitsSoftmax = Softmax(scope, logits);
+
+    auto grad_gradExpand = ExpandDims(scope, grad_grad, 1);
+    auto logitsSoftmaxExpand = ExpandDims(scope, logitsSoftmax, 2);
+    auto matMulResult =
+        BatchMatMul(scope, grad_gradExpand, logitsSoftmaxExpand);
+    axis.push_back(1);
+    auto squeezeResult = Squeeze(scope, matMulResult, Squeeze::Axis(axis));
+    auto subtractionResult = Subtract(scope, grad_grad, squeezeResult);
+    auto multiplyResult = Multiply(scope, subtractionResult, logitsSoftmax);
+    grad = Add(scope, grad, multiplyResult);
+  }
+  auto minusLogSoftmax = Multiply(scope, LogSoftmax(scope, logits), -1.0f);
+  grad_outputs->push_back(grad);
+  grad_outputs->push_back(BroadcastMul(scope, grad_loss, minusLogSoftmax));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("SoftmaxCrossEntropyWithLogits",
+                     SoftmaxCrossEntropyWithLogitsGrad);
+
 Status LogSoftmaxGrad(const Scope& scope, const Operation& op,
                       const std::vector<Output>& grad_inputs,
                       std::vector<Output>* grad_outputs) {
@@ -195,9 +270,9 @@ Status MaxPool3DGradHelper(const Scope& scope, const Operation& op,
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
   MaxPool3DGrad::Attrs grad_attrs;
-  auto dx = MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0],
-                          ksize, strides, padding,
-                          grad_attrs.DataFormat(data_format));
+  auto dx =
+      MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0], ksize,
+                    strides, padding, grad_attrs.DataFormat(data_format));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -216,10 +291,9 @@ Status AvgPoolGradHelper(const Scope& scope, const Operation& op,
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
   internal::AvgPoolGrad::Attrs grad_attrs;
-  auto dx =
-      internal::AvgPoolGrad(scope, Shape(scope, op.input(0)), grad_inputs[0],
-                            ksize, strides, padding,
-                            grad_attrs.DataFormat(data_format));
+  auto dx = internal::AvgPoolGrad(scope, Shape(scope, op.input(0)),
+                                  grad_inputs[0], ksize, strides, padding,
+                                  grad_attrs.DataFormat(data_format));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -238,9 +312,9 @@ Status AvgPool3DGradHelper(const Scope& scope, const Operation& op,
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
   AvgPool3DGrad::Attrs grad_attrs;
-  auto dx = AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0],
-                          ksize, strides, padding,
-                          grad_attrs.DataFormat(data_format));
+  auto dx =
+      AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0], ksize,
+                    strides, padding, grad_attrs.DataFormat(data_format));
   grad_outputs->push_back(dx);
   return scope.status();
 }
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index b4d457a9d1..f26a7e99e6 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -25,6 +25,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+using ops::AvgPool;
+using ops::AvgPool3D;
 using ops::BiasAdd;
 using ops::Conv2D;
 using ops::Elu;
@@ -33,11 +35,9 @@ using ops::FractionalMaxPool;
 using ops::L2Loss;
 using ops::LogSoftmax;
 using ops::LRN;
-using ops::AvgPool;
-using ops::AvgPool3D;
 using ops::MaxPool;
-using ops::MaxPoolV2;
 using ops::MaxPool3D;
+using ops::MaxPoolV2;
 using ops::Placeholder;
 using ops::Relu;
 using ops::Relu6;
@@ -111,6 +111,27 @@ TEST_F(NNGradTest, SoftmaxGrad) {
   RunTest(x, shape, y, shape);
 }
 
+TEST_F(NNGradTest, SoftmaxCrossEntropyWithLogitsGrad) {
+  // A batch of 5 examples, each with 3 possible labels (classes). The logits
+  // are what the network produces; they are compared against the "truth"
+  // labels.
+  TensorShape logitsShape({5, 3});
+  // One loss value for each entry in the batch.
+  TensorShape lossShape({5});
+
+  auto logits = Placeholder(scope_, DT_FLOAT,
+                            Placeholder::Shape(logitsShape));  // estimation
+  auto labels =
+      Placeholder(scope_, DT_FLOAT, Placeholder::Shape(logitsShape));  // truth
+  auto y =
+      tensorflow::ops::SoftmaxCrossEntropyWithLogits(scope_, logits, labels);
+  // Note the reversal of the backprop and loss orders; a separate issue,
+  // #18734, has been opened for this.
+  RunTest({logits, labels}, {logitsShape, logitsShape}, {y.backprop, y.loss},
+          {logitsShape, lossShape});
+}
+
 TEST_F(NNGradTest, LogSoftmaxGrad) {
   TensorShape shape({5, 3});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
@@ -253,7 +274,7 @@ TEST_F(NNGradTest, AvgPool3DGradHelper) {
   RunTest(x, x_shape, y, y_shape);
 }
 
-TEST_F(NNGradTest, LRN){
+TEST_F(NNGradTest, LRN) {
   TensorShape x_shape({1, 1, 2, 1});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
   auto y = LRN(scope_, x);
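For context, here is a minimal sketch of driving the newly registered gradient
through the public AddSymbolicGradients API. The shapes, variable names, and
main() wrapper are assumptions for illustration only, and the nn_grad gradient
library must be linked in so the registration above is present:

    #include <vector>

    #include "tensorflow/cc/framework/gradients.h"
    #include "tensorflow/cc/ops/standard_ops.h"
    #include "tensorflow/core/framework/tensor_shape.h"

    int main() {
      using namespace tensorflow;  // for brevity in this sketch

      Scope scope = Scope::NewRootScope();
      TensorShape logits_shape({5, 3});

      // Same setup as the new test: logits and labels share the [5, 3] shape.
      auto logits = ops::Placeholder(scope, DT_FLOAT,
                                     ops::Placeholder::Shape(logits_shape));
      auto labels = ops::Placeholder(scope, DT_FLOAT,
                                     ops::Placeholder::Shape(logits_shape));
      auto y = ops::SoftmaxCrossEntropyWithLogits(scope, logits, labels);

      // AddSymbolicGradients looks up the gradient function registered for
      // "SoftmaxCrossEntropyWithLogits" to build d(loss)/d(logits).
      std::vector<Output> grad_outputs;
      TF_CHECK_OK(AddSymbolicGradients(scope, {y.loss}, {Output(logits)},
                                       &grad_outputs));
      return 0;
    }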