From 09b0fc199129e0f487a39741bdf674cf09035cbc Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Mon, 8 Oct 2018 14:17:24 -0700 Subject: [tf.data] Choose non-deterministic seed once per Python-level `Dataset` object. This changes the behavior of randomness-introducing datasets (`tf.data.Dataset.shuffle()`, `tf.data.experimental.shuffle_and_repeat()`, and `tf.data.experimental.RandomDataset`). Previously, when you used the same `tf.data.Dataset` object multiple times in a pipeline (e.g. by zipping two datasets derived from the same randomness-introducing dataset) *and* you did not specify an explicit `seed`, the implementation would choose different non-deterministic seeds for each use of the `Dataset` object. With this change, the seed will be chosen once per `Dataset` (technically, once per `Dataset`-`Graph` combination, due to the vagaries of capturing state in `Dataset.make_one_shot_iterator()`), which means that all uses of the same dataset object will observe the same sequence of values. This change also revealed a small bug in how `Dataset.shuffle(..., reshuffle_each_iteration=False)` is serialized when an explicit seed is specified. The op-level seed was dropped, which could lead to non-deterministic behavior. This change fixes that issue by forwarding the op-level seed to the appropriate place. PiperOrigin-RevId: 216248013 --- tensorflow/core/kernels/data/shuffle_dataset_op.cc | 2 +- .../python/data/experimental/kernel_tests/BUILD | 13 +++++++ .../kernel_tests/random_dataset_test.py | 45 ++++++++++++++++++++++ .../kernel_tests/shuffle_and_repeat_test.py | 21 +++++++++- .../python/data/experimental/ops/random_ops.py | 21 ++++++++-- .../python/data/experimental/ops/shuffle_ops.py | 21 ++++++++-- tensorflow/python/data/kernel_tests/BUILD | 1 + .../data/kernel_tests/shuffle_dataset_op_test.py | 25 +++++++++++- tensorflow/python/data/ops/dataset_ops.py | 22 +++++++++-- tensorflow/python/data/util/BUILD | 1 + tensorflow/python/data/util/random_seed.py | 5 ++- tensorflow/python/data/util/random_seed_test.py | 13 ++++++- 12 files changed, 174 insertions(+), 16 deletions(-) create mode 100644 tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc index 66466d6a36..9f54c381a9 100644 --- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc +++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc @@ -485,7 +485,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase { int64 buffer_size, int64 seed, int64 seed2, int64 count) : ShuffleDatasetBase(ctx, input, buffer_size, count), seed_(seed), - seed2_(seed) {} + seed2_(seed2) {} string DebugString() const override { return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_, diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD index 4eef9580ad..a67f6ff031 100644 --- a/tensorflow/python/data/experimental/kernel_tests/BUILD +++ b/tensorflow/python/data/experimental/kernel_tests/BUILD @@ -453,6 +453,18 @@ cuda_py_test( tags = ["no_windows_gpu"], ) +py_test( + name = "random_dataset_test", + srcs = ["random_dataset_test.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python/data/experimental/ops:random_ops", + "//tensorflow/python/data/kernel_tests:test_base", + "//tensorflow/python/data/ops:dataset_ops", + "@absl_py//absl/testing:parameterized", + ], +) + py_library( name = "reader_dataset_ops_test_base", testonly = 1, @@ -562,6 +574,7 @@ py_test( "//tensorflow/python/data/kernel_tests:test_base", "//tensorflow/python/data/ops:dataset_ops", "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py new file mode 100644 index 0000000000..d403a575ec --- /dev/null +++ b/tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py @@ -0,0 +1,45 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for `tf.data.experimental.RandomDataset()`.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized + +from tensorflow.python.data.experimental.ops import random_ops +from tensorflow.python.data.kernel_tests import test_base +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import errors + + +class RandomDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): + + @parameterized.named_parameters( + ("NoSeed", None), + ("WithSeed", 42), + ) + def testZipRandomDataset(self, seed): + dataset = random_ops.RandomDataset(seed=seed).take(30) + dataset = dataset_ops.Dataset.zip((dataset, dataset)) + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + + with self.cached_session() as sess: + for _ in range(30): + x, y = sess.run(next_element) + self.assertEqual(x, y) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py index c208963a86..883169495f 100644 --- a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import shuffle_ops @@ -27,7 +28,7 @@ from tensorflow.python.framework import ops from tensorflow.python.platform import test -class ShuffleAndRepeatTest(test_base.DatasetTestBase): +class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase): def _build_ds(self, seed, count=5, num_elements=20): return dataset_ops.Dataset.range(num_elements).apply( @@ -110,6 +111,24 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): with self.session(graph=g) as sess: sess.run(get_next_op) + @parameterized.named_parameters( + ("NoSeed", None), + ("WithSeed", 42), + ) + def testShuffleAndRepeatAndZipDataset(self, seed): + dataset = dataset_ops.Dataset.range(10).apply( + shuffle_ops.shuffle_and_repeat(10, count=3, seed=seed)) + dataset = dataset_ops.Dataset.zip((dataset, dataset)) + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + + with self.cached_session() as sess: + for _ in range(30): + x, y = sess.run(next_element) + self.assertEqual(x, y) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/experimental/ops/random_ops.py b/tensorflow/python/data/experimental/ops/random_ops.py index e3a2aeab31..25d7fbf691 100644 --- a/tensorflow/python/data/experimental/ops/random_ops.py +++ b/tensorflow/python/data/experimental/ops/random_ops.py @@ -33,13 +33,26 @@ class RandomDataset(dataset_ops.DatasetSource): def __init__(self, seed=None): """A `Dataset` of pseudorandom values.""" super(RandomDataset, self).__init__() - self._seed, self._seed2 = random_seed.get_seed(seed) + + # NOTE(mrry): We generate the seed-pair once per graph in which the dataset + # is iterated over, and cache it in `self._graph_seed_map`. This supports + # two features: iterating over the same `ShuffleDataset` twice in the same + # pipeline and observing the same order (by tying the seeds together with + # a randomly-generated seed), and using `Dataset.make_one_shot_iterator()`, + # which requires the stateful RNG op to be created inside the same graph as + # the dataset. + self._original_seed = seed + self._graph_seed_map = {} def _as_variant_tensor(self): + try: + seed, seed2 = self._graph_seed_map[ops.get_default_graph()] + except KeyError: + seed, seed2 = random_seed.get_seed(self._original_seed) + self._graph_seed_map[ops.get_default_graph()] = (seed, seed2) + return gen_dataset_ops.random_dataset( - seed=self._seed, - seed2=self._seed2, - **dataset_ops.flat_structure(self)) + seed=seed, seed2=seed2, **dataset_ops.flat_structure(self)) @property def output_classes(self): diff --git a/tensorflow/python/data/experimental/ops/shuffle_ops.py b/tensorflow/python/data/experimental/ops/shuffle_ops.py index a4307212da..a82e4b7d09 100644 --- a/tensorflow/python/data/experimental/ops/shuffle_ops.py +++ b/tensorflow/python/data/experimental/ops/shuffle_ops.py @@ -39,17 +39,32 @@ class _ShuffleAndRepeatDataset(dataset_ops.UnaryDataset): else: self._count = ops.convert_to_tensor( count, dtype=dtypes.int64, name="count") - self._seed, self._seed2 = random_seed.get_seed(seed) + + # NOTE(mrry): We generate the seed-pair once per graph in which the dataset + # is iterated over, and cache it in `self._graph_seed_map`. This supports + # two features: iterating over the same `ShuffleDataset` twice in the same + # pipeline and observing the same order (by tying the seeds together with + # a randomly-generated seed), and using `Dataset.make_one_shot_iterator()`, + # which requires the stateful RNG op to be created inside the same graph as + # the dataset. + self._original_seed = seed + self._graph_seed_map = {} def _as_variant_tensor(self): + try: + seed, seed2 = self._graph_seed_map[ops.get_default_graph()] + except KeyError: + seed, seed2 = random_seed.get_seed(self._original_seed) + self._graph_seed_map[ops.get_default_graph()] = (seed, seed2) + # pylint: disable=protected-access input_resource = self._input_dataset._as_variant_tensor() return gen_dataset_ops.shuffle_and_repeat_dataset( input_resource, buffer_size=self._buffer_size, count=self._count, - seed=self._seed, - seed2=self._seed2, + seed=seed, + seed2=seed2, **dataset_ops.flat_structure(self)) # pylint: enable=protected-access diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD index c7295d6e69..ecb24103b3 100644 --- a/tensorflow/python/data/kernel_tests/BUILD +++ b/tensorflow/python/data/kernel_tests/BUILD @@ -443,6 +443,7 @@ tf_py_test( srcs = ["shuffle_dataset_op_test.py"], additional_deps = [ ":test_base", + "@absl_py//absl/testing:parameterized", "//third_party/py/numpy", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", diff --git a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py index 347af18576..6001721726 100644 --- a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py +++ b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py @@ -19,6 +19,7 @@ from __future__ import print_function import collections +from absl.testing import parameterized import numpy as np from tensorflow.python.data.kernel_tests import test_base @@ -31,7 +32,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -class ShuffleDatasetTest(test_base.DatasetTestBase): +class ShuffleDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): def testShuffleDataset(self): components = ( @@ -209,5 +210,27 @@ class ShuffleDatasetTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): sess.run(next_element) + @parameterized.named_parameters( + ("ReshuffleEachIterationNoSeed", None, True), + ("ReshuffleEachIterationWithSeed", 42, True), + ("NoReshuffleEachIterationNoSeed", None, False), + ("NoReshuffleEachIterationWithSeed", 42, False), + ) + def testShuffleAndZipDataset(self, seed, reshuffle): + dataset = (dataset_ops.Dataset.range(10) + .shuffle(10, seed=seed, reshuffle_each_iteration=reshuffle) + .repeat(3)) + dataset = dataset_ops.Dataset.zip((dataset, dataset)) + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + + with self.cached_session() as sess: + for _ in range(30): + x, y = sess.run(next_element) + self.assertEqual(x, y) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index b7e19055f2..2d036fd0d6 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -2254,18 +2254,34 @@ class ShuffleDataset(UnaryDataset): self._input_dataset = input_dataset self._buffer_size = ops.convert_to_tensor( buffer_size, dtype=dtypes.int64, name="buffer_size") - self._seed, self._seed2 = random_seed.get_seed(seed) + + # NOTE(mrry): We generate the seed-pair once per graph in which the dataset + # is iterated over, and cache it in `self._graph_seed_map`. This supports + # two features: iterating over the same `ShuffleDataset` twice in the same + # pipeline and observing the same order (by tying the seeds together with + # a randomly-generated seed), and using `Dataset.make_one_shot_iterator()`, + # which requires the stateful RNG op to be created inside the same graph as + # the dataset. + self._original_seed = seed + self._graph_seed_map = {} + if reshuffle_each_iteration is None: self._reshuffle_each_iteration = True else: self._reshuffle_each_iteration = reshuffle_each_iteration def _as_variant_tensor(self): + try: + seed, seed2 = self._graph_seed_map[ops.get_default_graph()] + except KeyError: + seed, seed2 = random_seed.get_seed(self._original_seed) + self._graph_seed_map[ops.get_default_graph()] = (seed, seed2) + return gen_dataset_ops.shuffle_dataset( self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access buffer_size=self._buffer_size, - seed=self._seed, - seed2=self._seed2, + seed=seed, + seed2=seed2, reshuffle_each_iteration=self._reshuffle_each_iteration, **flat_structure(self)) diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD index 39082ce370..95bf3209d7 100644 --- a/tensorflow/python/data/util/BUILD +++ b/tensorflow/python/data/util/BUILD @@ -142,6 +142,7 @@ py_test( ":random_seed", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:random_ops", "//tensorflow/python:util", ], ) diff --git a/tensorflow/python/data/util/random_seed.py b/tensorflow/python/data/util/random_seed.py index d5169f7a53..d24df6d957 100644 --- a/tensorflow/python/data/util/random_seed.py +++ b/tensorflow/python/data/util/random_seed.py @@ -24,6 +24,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops def get_seed(seed): @@ -37,7 +38,7 @@ def get_seed(seed): Returns: A tuple of two `tf.int64` scalar tensors that should be used for the local - seed of the calling dataset. + seeds of the calling dataset. """ seed, seed2 = random_seed.get_seed(seed) if seed is None: @@ -45,7 +46,7 @@ def get_seed(seed): else: seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed") if seed2 is None: - seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2") + seed2 = random_ops.random_uniform([], 1, 2**63 - 1, dtype=dtypes.int64) else: with ops.name_scope("seed2") as scope: seed2 = ops.convert_to_tensor(seed2, dtype=dtypes.int64) diff --git a/tensorflow/python/data/util/random_seed_test.py b/tensorflow/python/data/util/random_seed_test.py index a809151e6e..5df2e38c62 100644 --- a/tensorflow/python/data/util/random_seed_test.py +++ b/tensorflow/python/data/util/random_seed_test.py @@ -41,7 +41,6 @@ class RandomSeedTest(test.TestCase): # (input_graph_seed, input_op_seed) # and output from get_seed: # (output_graph_seed, output_op_seed) - ((None, None), (0, 0)), ((None, 1), (random_seed.DEFAULT_GRAPH_SEED, 1)), ((1, 1), (1, 1)), ((0, 0), (0, 2**31 - 1)), # Avoid nondeterministic (0, 0) output @@ -78,6 +77,18 @@ class RandomSeedTest(test.TestCase): self.assertEqual((g_seed, op_seed), toutput, msg=msg) random_seed.set_random_seed(None) + @test_util.run_in_graph_and_eager_modes + def testNondeterministicRandomSeed(self): + random_seed.set_random_seed(None) + op_seeds = [] + for _ in range(50): + g_seed, op_seed = data_random_seed.get_seed(None) + g_seed = self.evaluate(g_seed) + op_seed = self.evaluate(op_seed) + self.assertEqual(0, g_seed) + self.assertNotEqual(0, op_seed) + op_seeds.append(op_seed) + self.assertGreater(len(set(op_seeds)), 1) if __name__ == '__main__': test.main() -- cgit v1.2.3