From f410ffc1699e864e84857089183db0d952ada7fe Mon Sep 17 00:00:00 2001
From: Andreas Madsen
Date: Thu, 26 Jul 2018 15:44:39 +0200
Subject: make sparsemax nan and infinity safe

Logits that are -inf will be given 0 probability, and logits that are inf
will result in a nan output. Likewise, if all logits are -inf, the output
will also be nan. This is done by using where operators, mostly because
0 * inf = nan and x / 0 = sign(x) * inf under the IEEE 754 standard, and
those results are not mathematically correct in the context of the
sparsemax algorithm.

Fixes: https://github.com/tensorflow/tensorflow/issues/15564
---
 .../python/kernel_tests/sparsemax_loss_test.py     | 64 ++++++++++++++++++++++
 .../python/kernel_tests/sparsemax_test.py          | 63 ++++++++++++++++++++-
 .../contrib/sparsemax/python/ops/sparsemax.py      | 30 +++++++++-
 .../contrib/sparsemax/python/ops/sparsemax_loss.py | 32 +++++++++--
 4 files changed, 178 insertions(+), 11 deletions(-)

(limited to 'tensorflow/contrib')

diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
index 360e7dbe75..2db76a6d56 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
@@ -109,6 +109,66 @@ class SparsemaxLossTest(test.TestCase):
         np_loss, tf_loss_out, half_atol=1e-2, half_rtol=5e-3)
     self.assertShapeEqual(np_loss, tf_loss_op)
 
+  def _test_sparsemax_loss_of_nan(self, dtype, random, use_gpu):
+    """check sparsemax-loss transfers nan"""
+    q = np.asarray([
+        [0, 0, 1],
+        [0, 0, 1],
+        [0, 0, 1]
+    ])
+    z_nan = np.asarray([
+        [0, np.nan, 0],
+        [0, np.nan, np.nan],
+        [np.nan, np.nan, np.nan]
+    ]).astype(dtype)
+
+    _, tf_loss_nan = self._tf_sparsemax_loss(z_nan, q, dtype, use_gpu)
+    self.assertAllCloseAccordingToType(
+        [np.nan, np.nan, np.nan],
+        tf_loss_nan)
+
+  def _test_sparsemax_loss_of_inf(self, dtype, random, use_gpu):
+    """check sparsemax-loss is infinity safe"""
+    q = np.asarray([
+        [0, 0, 1],
+        [0, 0, 1],
+        [0, 0, 1],
+        [0, 0, 1]
+    ])
+    z_neg = np.asarray([
+        [0, -np.inf, 0],
+        [0, -np.inf, -np.inf],
+        [-np.inf, -np.inf, 0],
+        [-np.inf, -np.inf, -np.inf],
+    ]).astype(dtype)
+    z_pos = np.asarray([
+        [0, np.inf, 0],
+        [0, np.inf, np.inf],
+        [np.inf, np.inf, 0],
+        [np.inf, np.inf, np.inf]
+    ]).astype(dtype)
+    z_mix = np.asarray([
+        [0, np.inf, 0],
+        [0, np.inf, -np.inf],
+        [-np.inf, np.inf, 0],
+        [-np.inf, np.inf, -np.inf]
+    ]).astype(dtype)
+
+    _, tf_loss_neg = self._tf_sparsemax_loss(z_neg, q, dtype, use_gpu)
+    self.assertAllCloseAccordingToType(
+        [0.25, np.inf, 0, np.nan],
+        tf_loss_neg)
+
+    _, tf_loss_pos = self._tf_sparsemax_loss(z_pos, q, dtype, use_gpu)
+    self.assertAllCloseAccordingToType(
+        [np.nan, np.nan, np.nan, np.nan],
+        tf_loss_pos)
+
+    _, tf_loss_mix = self._tf_sparsemax_loss(z_mix, q, dtype, use_gpu)
+    self.assertAllCloseAccordingToType(
+        [np.nan, np.nan, np.nan, np.nan],
+        tf_loss_mix)
+
   def _test_constant_add(self, dtype, random, use_gpu):
     """check sparsemax-loss proposition 3"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10))
@@ -198,6 +258,10 @@ class SparsemaxLossTest(test.TestCase):
 
     self._test_sparsemax_loss_against_numpy(dtype, random, use_gpu=False)
 
+    self._test_sparsemax_loss_of_nan(dtype, random, use_gpu=False)
+
+    self._test_sparsemax_loss_of_inf(dtype, random, use_gpu=False)
+
     self._test_constant_add(dtype, random, use_gpu=False)
 
     self._test_sparsemax_loss_positive(dtype, random, use_gpu=False)
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
index 259e62bd86..38c6dd15db 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
@@ -87,6 +87,61 @@ class SparsemaxTest(test.TestCase):
         p_sparemax, tf_sparsemax_out, half_atol=5e-3)
     self.assertShapeEqual(p_sparemax, tf_sparsemax_op)
 
+  def _test_sparsemax_of_nan(self, dtype, random, use_gpu):
+    """check sparsemax transfers nan"""
+    z_nan = np.asarray([
+        [0, np.nan, 0],
+        [0, np.nan, np.nan],
+        [np.nan, np.nan, np.nan],
+    ]).astype(dtype)
+
+    _, tf_sparsemax_nan = self._tf_sparsemax(z_nan, dtype, use_gpu)
+    self.assertAllCloseAccordingToType([
+        [np.nan, np.nan, np.nan],
+        [np.nan, np.nan, np.nan],
+        [np.nan, np.nan, np.nan]
+    ], tf_sparsemax_nan)
+
+  def _test_sparsemax_of_inf(self, dtype, random, use_gpu):
+    """check sparsemax is infinity safe"""
+    z_neg = np.asarray([
+        [0, -np.inf, 0],
+        [0, -np.inf, -np.inf],
+        [-np.inf, -np.inf, -np.inf],
+    ]).astype(dtype)
+    z_pos = np.asarray([
+        [0, np.inf, 0],
+        [0, np.inf, np.inf],
+        [np.inf, np.inf, np.inf]
+    ]).astype(dtype)
+    z_mix = np.asarray([
+        [0, np.inf, 0],
+        [0, np.inf, -np.inf],
+        [-np.inf, np.inf, -np.inf]
+    ]).astype(dtype)
+
+    _, tf_sparsemax_neg = self._tf_sparsemax(z_neg, dtype, use_gpu)
+    self.assertAllCloseAccordingToType([
+        [0.5, 0, 0.5],
+        [1, 0, 0],
+        [np.nan, np.nan, np.nan]
+    ], tf_sparsemax_neg)
+
+    _, tf_sparsemax_pos = self._tf_sparsemax(z_pos, dtype, use_gpu)
+    self.assertAllCloseAccordingToType([
+        [np.nan, np.nan, np.nan],
+        [np.nan, np.nan, np.nan],
+        [np.nan, np.nan, np.nan]
+    ], tf_sparsemax_pos)
+
+    _, tf_sparsemax_mix = self._tf_sparsemax(z_mix, dtype, use_gpu)
+    self.assertAllCloseAccordingToType([
+        [np.nan, np.nan, np.nan],
+        [np.nan, np.nan, np.nan],
+        [np.nan, np.nan, np.nan]
+    ], tf_sparsemax_mix)
+
+
   def _test_sparsemax_of_zero(self, dtype, random, use_gpu):
     """check sparsemax proposition 1, part 1"""
     z = np.zeros((1, 10))
@@ -97,7 +152,7 @@ class SparsemaxTest(test.TestCase):
     self.assertAllCloseAccordingToType(p_sparemax, tf_sparsemax_out)
     self.assertShapeEqual(p_sparemax, tf_sparsemax_op)
 
-  def _test_sparsemax_of_inf(self, dtype, random, use_gpu):
+  def _test_sparsemax_of_to_inf(self, dtype, random, use_gpu):
     """check sparsemax proposition 1, part 2"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10))
 
@@ -210,10 +265,14 @@ class SparsemaxTest(test.TestCase):
 
     self._test_sparsemax_against_numpy(dtype, random, use_gpu=False)
 
-    self._test_sparsemax_of_zero(dtype, random, use_gpu=False)
+    self._test_sparsemax_of_nan(dtype, random, use_gpu=False)
 
     self._test_sparsemax_of_inf(dtype, random, use_gpu=False)
 
+    self._test_sparsemax_of_zero(dtype, random, use_gpu=False)
+
+    self._test_sparsemax_of_to_inf(dtype, random, use_gpu=False)
+
     self._test_constant_add(dtype, random, use_gpu=False)
 
     self._test_permutation(dtype, random, use_gpu=False)
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
index e617af2ff1..f903b629c7 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
@@ -49,7 +49,14 @@ def sparsemax(logits, name=None):
   obs = array_ops.shape(logits)[0]
   dims = array_ops.shape(logits)[1]
 
-  z = logits - math_ops.reduce_mean(logits, axis=1)[:, array_ops.newaxis]
+  # In the paper, they call the logits z.
+  # The mean(logits) can be subtracted from the logits to make the algorithm
+  # more numerically stable. The instability in this algorithm comes mostly
+  # from the z_cumsum. Subtracting the mean will cause z_cumsum to be close
+  # to zero. However, in practice the numerical instability issues are very
+  # minor and subtracting the mean causes extra issues with inf and nan
+  # input.
+  z = logits
 
   # sort z
   z_sorted, _ = nn.top_k(z, k=dims)
@@ -64,10 +71,27 @@ def sparsemax(logits, name=None):
   k_z = math_ops.reduce_sum(math_ops.cast(z_check, dtypes.int32), axis=1)
 
   # calculate tau(z)
-  indices = array_ops.stack([math_ops.range(0, obs), k_z - 1], axis=1)
+  # If there are inf values or all values are -inf, k_z will be zero; this
+  # is mathematically invalid and will also cause the gather_nd to fail.
+  # Prevent this issue for now by setting k_z = 1 if k_z = 0; this is then
+  # fixed later (see p_safe) by returning p = nan. This results in the same
+  # behavior as softmax.
+  k_z_safe = math_ops.maximum(k_z, 1)
+  indices = array_ops.stack([math_ops.range(0, obs), k_z_safe - 1], axis=1)
   tau_sum = array_ops.gather_nd(z_cumsum, indices)
   tau_z = (tau_sum - 1) / math_ops.cast(k_z, logits.dtype)
 
   # calculate p
-  return math_ops.maximum(
+  p = math_ops.maximum(
       math_ops.cast(0, logits.dtype), z - tau_z[:, array_ops.newaxis])
+  # If k_z = 0 or if z = nan, then the input is invalid
+  p_safe = array_ops.where(
+      math_ops.logical_or(
+          math_ops.equal(k_z, 0),
+          math_ops.is_nan(z_cumsum[:, -1])
+      ),
+      array_ops.fill([obs, dims], math_ops.cast(float('nan'), logits.dtype)),
+      p
+  )
+
+  return p_safe
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
index 582d1e6136..9095cfe267 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
@@ -47,14 +47,34 @@ def sparsemax_loss(logits, sparsemax, labels, name=None):
   sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax")
   labels = ops.convert_to_tensor(labels, name="labels")
 
-  shifted_logits = logits - \
-      math_ops.reduce_mean(logits, axis=1)[:, array_ops.newaxis]
+  # In the paper, they call the logits z.
+  # A constant can be subtracted from the logits to make the algorithm
+  # more numerically stable in theory. However, there is no major source
+  # of numerical instability in this algorithm.
+  z = logits
 
   # sum over support
-  support = math_ops.cast(sparsemax > 0, sparsemax.dtype)
-  sum_s = support * sparsemax * (shifted_logits - 0.5 * sparsemax)
+  # Use a conditional where instead of a multiplication to support z = -inf.
+  # If z = -inf and there is no support (sparsemax = 0), a multiplication
+  # would cause 0 * -inf = nan, which is not correct in this case.
+  sum_s = array_ops.where(
+      math_ops.logical_or(sparsemax > 0, math_ops.is_nan(sparsemax)),
+      sparsemax * (z - 0.5 * sparsemax),
+      array_ops.zeros_like(sparsemax)
+  )
 
   # - z_k + ||q||^2
-  q_part = labels * (0.5 * labels - shifted_logits)
+  q_part = labels * (0.5 * labels - z)
+  # Fix the case where labels = 0 and z = -inf, where q_part would
+  # otherwise be 0 * -inf = nan. But since the labels = 0, no cost for
+  # z = -inf should be considered.
+  # The code below also covers the case where z = inf. However, in this
+  # case the sparsemax will be nan, which means sum_s will also be nan, and
+  # therefore this case doesn't need additional special treatment.
+  q_part_safe = array_ops.where(
+      math_ops.logical_and(math_ops.equal(labels, 0), math_ops.is_inf(z)),
+      array_ops.zeros_like(z),
+      q_part
+  )
 
-  return math_ops.reduce_sum(sum_s + q_part, axis=1)
+  return math_ops.reduce_sum(sum_s + q_part_safe, axis=1)
-- 
cgit v1.2.3
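
As a sanity check on the expected values in the new tests, here is a minimal
NumPy sketch of the nan/inf-safe sparsemax computation added in sparsemax.py.
It is not part of the patch; the function name sparsemax_safe and the use of
NumPy instead of TensorFlow ops are assumptions for illustration only.

import numpy as np


def sparsemax_safe(logits):
  z = np.asarray(logits, dtype=np.float64)
  obs, dims = z.shape

  # sort z in decreasing order and compute the cumulative sum
  z_sorted = -np.sort(-z, axis=1)
  z_cumsum = np.cumsum(z_sorted, axis=1)
  k = np.arange(1, dims + 1, dtype=z.dtype)

  # k(z): size of the support
  z_check = 1 + k * z_sorted > z_cumsum
  k_z = np.sum(z_check, axis=1)

  # k_z is 0 when a row contains inf or only -inf; clamp it so the gather
  # below stays in bounds (this mirrors k_z_safe in the patch)
  k_z_safe = np.maximum(k_z, 1)

  with np.errstate(divide='ignore', invalid='ignore'):
    tau_sum = z_cumsum[np.arange(obs), k_z_safe - 1]
    tau_z = (tau_sum - 1) / k_z
    p = np.maximum(0.0, z - tau_z[:, np.newaxis])

  # rows with k_z == 0 or a nan cumulative sum are invalid; return nan for
  # them, like the p_safe where() in the patch
  invalid = (k_z == 0) | np.isnan(z_cumsum[:, -1])
  p[invalid, :] = np.nan
  return p


# -inf logits get probability 0; a row containing inf (or only -inf) is nan:
print(sparsemax_safe([[0., -np.inf, 0.],                # -> [0.5, 0.0, 0.5]
                      [0., np.inf, 0.],                 # -> [nan, nan, nan]
                      [-np.inf, -np.inf, -np.inf]]))    # -> [nan, nan, nan]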