aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Derek Murray <mrry@google.com>2018-10-08 14:17:24 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-10-08 14:24:34 -0700
commit09b0fc199129e0f487a39741bdf674cf09035cbc (patch)
treede5935da3271c91bc88dd5700100082b4fc00673
parent13b47e6c4f9d7b295948b1057139bf676e394b6f (diff)
[tf.data] Choose non-deterministic seed once per Python-level `Dataset` object.
This changes the behavior of randomness-introducing datasets (`tf.data.Dataset.shuffle()`, `tf.data.experimental.shuffle_and_repeat()`, and `tf.data.experimental.RandomDataset`). Previously, when you used the same `tf.data.Dataset` object multiple times in a pipeline (e.g. by zipping two datasets derived from the same randomness-introducing dataset) *and* you did not specify an explicit `seed`, the implementation would choose different non-deterministic seeds for each use of the `Dataset` object. With this change, the seed will be chosen once per `Dataset` (technically, once per `Dataset`-`Graph` combination, due to the vagaries of capturing state in `Dataset.make_one_shot_iterator()`), which means that all uses of the same dataset object will observe the same sequence of values. This change also revealed a small bug in how `Dataset.shuffle(..., reshuffle_each_iteration=False)` is serialized when an explicit seed is specified. The op-level seed was dropped, which could lead to non-deterministic behavior. This change fixes that issue by forwarding the op-level seed to the appropriate place. PiperOrigin-RevId: 216248013
-rw-r--r--tensorflow/core/kernels/data/shuffle_dataset_op.cc2
-rw-r--r--tensorflow/python/data/experimental/kernel_tests/BUILD13
-rw-r--r--tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py45
-rw-r--r--tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py21
-rw-r--r--tensorflow/python/data/experimental/ops/random_ops.py21
-rw-r--r--tensorflow/python/data/experimental/ops/shuffle_ops.py21
-rw-r--r--tensorflow/python/data/kernel_tests/BUILD1
-rw-r--r--tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py25
-rw-r--r--tensorflow/python/data/ops/dataset_ops.py22
-rw-r--r--tensorflow/python/data/util/BUILD1
-rw-r--r--tensorflow/python/data/util/random_seed.py5
-rw-r--r--tensorflow/python/data/util/random_seed_test.py13
12 files changed, 174 insertions, 16 deletions
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index 66466d6a36..9f54c381a9 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -485,7 +485,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
int64 buffer_size, int64 seed, int64 seed2, int64 count)
: ShuffleDatasetBase(ctx, input, buffer_size, count),
seed_(seed),
- seed2_(seed) {}
+ seed2_(seed2) {}
string DebugString() const override {
return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index 4eef9580ad..a67f6ff031 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -453,6 +453,18 @@ cuda_py_test(
tags = ["no_windows_gpu"],
)
+py_test(
+ name = "random_dataset_test",
+ srcs = ["random_dataset_test.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ "//tensorflow/python/data/experimental/ops:random_ops",
+ "//tensorflow/python/data/kernel_tests:test_base",
+ "//tensorflow/python/data/ops:dataset_ops",
+ "@absl_py//absl/testing:parameterized",
+ ],
+)
+
py_library(
name = "reader_dataset_ops_test_base",
testonly = 1,
@@ -562,6 +574,7 @@ py_test(
"//tensorflow/python/data/kernel_tests:test_base",
"//tensorflow/python/data/ops:dataset_ops",
"//third_party/py/numpy",
+ "@absl_py//absl/testing:parameterized",
],
)
diff --git a/tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py
new file mode 100644
index 0000000000..d403a575ec
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/random_dataset_test.py
@@ -0,0 +1,45 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.RandomDataset()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import random_ops
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+
+
+class RandomDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+ @parameterized.named_parameters(
+ ("NoSeed", None),
+ ("WithSeed", 42),
+ )
+ def testZipRandomDataset(self, seed):
+ dataset = random_ops.RandomDataset(seed=seed).take(30)
+ dataset = dataset_ops.Dataset.zip((dataset, dataset))
+ iterator = dataset.make_one_shot_iterator()
+ next_element = iterator.get_next()
+
+ with self.cached_session() as sess:
+ for _ in range(30):
+ x, y = sess.run(next_element)
+ self.assertEqual(x, y)
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(next_element)
diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
index c208963a86..883169495f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from absl.testing import parameterized
import numpy as np
from tensorflow.python.data.experimental.ops import shuffle_ops
@@ -27,7 +28,7 @@ from tensorflow.python.framework import ops
from tensorflow.python.platform import test
-class ShuffleAndRepeatTest(test_base.DatasetTestBase):
+class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase):
def _build_ds(self, seed, count=5, num_elements=20):
return dataset_ops.Dataset.range(num_elements).apply(
@@ -110,6 +111,24 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
with self.session(graph=g) as sess:
sess.run(get_next_op)
+ @parameterized.named_parameters(
+ ("NoSeed", None),
+ ("WithSeed", 42),
+ )
+ def testShuffleAndRepeatAndZipDataset(self, seed):
+ dataset = dataset_ops.Dataset.range(10).apply(
+ shuffle_ops.shuffle_and_repeat(10, count=3, seed=seed))
+ dataset = dataset_ops.Dataset.zip((dataset, dataset))
+ iterator = dataset.make_one_shot_iterator()
+ next_element = iterator.get_next()
+
+ with self.cached_session() as sess:
+ for _ in range(30):
+ x, y = sess.run(next_element)
+ self.assertEqual(x, y)
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(next_element)
+
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/python/data/experimental/ops/random_ops.py b/tensorflow/python/data/experimental/ops/random_ops.py
index e3a2aeab31..25d7fbf691 100644
--- a/tensorflow/python/data/experimental/ops/random_ops.py
+++ b/tensorflow/python/data/experimental/ops/random_ops.py
@@ -33,13 +33,26 @@ class RandomDataset(dataset_ops.DatasetSource):
def __init__(self, seed=None):
"""A `Dataset` of pseudorandom values."""
super(RandomDataset, self).__init__()
- self._seed, self._seed2 = random_seed.get_seed(seed)
+
+ # NOTE(mrry): We generate the seed-pair once per graph in which the dataset
+ # is iterated over, and cache it in `self._graph_seed_map`. This supports
+ # two features: iterating over the same `ShuffleDataset` twice in the same
+ # pipeline and observing the same order (by tying the seeds together with
+ # a randomly-generated seed), and using `Dataset.make_one_shot_iterator()`,
+ # which requires the stateful RNG op to be created inside the same graph as
+ # the dataset.
+ self._original_seed = seed
+ self._graph_seed_map = {}
def _as_variant_tensor(self):
+ try:
+ seed, seed2 = self._graph_seed_map[ops.get_default_graph()]
+ except KeyError:
+ seed, seed2 = random_seed.get_seed(self._original_seed)
+ self._graph_seed_map[ops.get_default_graph()] = (seed, seed2)
+
return gen_dataset_ops.random_dataset(
- seed=self._seed,
- seed2=self._seed2,
- **dataset_ops.flat_structure(self))
+ seed=seed, seed2=seed2, **dataset_ops.flat_structure(self))
@property
def output_classes(self):
diff --git a/tensorflow/python/data/experimental/ops/shuffle_ops.py b/tensorflow/python/data/experimental/ops/shuffle_ops.py
index a4307212da..a82e4b7d09 100644
--- a/tensorflow/python/data/experimental/ops/shuffle_ops.py
+++ b/tensorflow/python/data/experimental/ops/shuffle_ops.py
@@ -39,17 +39,32 @@ class _ShuffleAndRepeatDataset(dataset_ops.UnaryDataset):
else:
self._count = ops.convert_to_tensor(
count, dtype=dtypes.int64, name="count")
- self._seed, self._seed2 = random_seed.get_seed(seed)
+
+ # NOTE(mrry): We generate the seed-pair once per graph in which the dataset
+ # is iterated over, and cache it in `self._graph_seed_map`. This supports
+ # two features: iterating over the same `ShuffleDataset` twice in the same
+ # pipeline and observing the same order (by tying the seeds together with
+ # a randomly-generated seed), and using `Dataset.make_one_shot_iterator()`,
+ # which requires the stateful RNG op to be created inside the same graph as
+ # the dataset.
+ self._original_seed = seed
+ self._graph_seed_map = {}
def _as_variant_tensor(self):
+ try:
+ seed, seed2 = self._graph_seed_map[ops.get_default_graph()]
+ except KeyError:
+ seed, seed2 = random_seed.get_seed(self._original_seed)
+ self._graph_seed_map[ops.get_default_graph()] = (seed, seed2)
+
# pylint: disable=protected-access
input_resource = self._input_dataset._as_variant_tensor()
return gen_dataset_ops.shuffle_and_repeat_dataset(
input_resource,
buffer_size=self._buffer_size,
count=self._count,
- seed=self._seed,
- seed2=self._seed2,
+ seed=seed,
+ seed2=seed2,
**dataset_ops.flat_structure(self))
# pylint: enable=protected-access
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index c7295d6e69..ecb24103b3 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -443,6 +443,7 @@ tf_py_test(
srcs = ["shuffle_dataset_op_test.py"],
additional_deps = [
":test_base",
+ "@absl_py//absl/testing:parameterized",
"//third_party/py/numpy",
"//tensorflow/python:array_ops",
"//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
index 347af18576..6001721726 100644
--- a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
import collections
+from absl.testing import parameterized
import numpy as np
from tensorflow.python.data.kernel_tests import test_base
@@ -31,7 +32,7 @@ from tensorflow.python.ops import array_ops
from tensorflow.python.platform import test
-class ShuffleDatasetTest(test_base.DatasetTestBase):
+class ShuffleDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
def testShuffleDataset(self):
components = (
@@ -209,5 +210,27 @@ class ShuffleDatasetTest(test_base.DatasetTestBase):
with self.assertRaises(errors.OutOfRangeError):
sess.run(next_element)
+ @parameterized.named_parameters(
+ ("ReshuffleEachIterationNoSeed", None, True),
+ ("ReshuffleEachIterationWithSeed", 42, True),
+ ("NoReshuffleEachIterationNoSeed", None, False),
+ ("NoReshuffleEachIterationWithSeed", 42, False),
+ )
+ def testShuffleAndZipDataset(self, seed, reshuffle):
+ dataset = (dataset_ops.Dataset.range(10)
+ .shuffle(10, seed=seed, reshuffle_each_iteration=reshuffle)
+ .repeat(3))
+ dataset = dataset_ops.Dataset.zip((dataset, dataset))
+ iterator = dataset.make_one_shot_iterator()
+ next_element = iterator.get_next()
+
+ with self.cached_session() as sess:
+ for _ in range(30):
+ x, y = sess.run(next_element)
+ self.assertEqual(x, y)
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(next_element)
+
+
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index b7e19055f2..2d036fd0d6 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -2254,18 +2254,34 @@ class ShuffleDataset(UnaryDataset):
self._input_dataset = input_dataset
self._buffer_size = ops.convert_to_tensor(
buffer_size, dtype=dtypes.int64, name="buffer_size")
- self._seed, self._seed2 = random_seed.get_seed(seed)
+
+ # NOTE(mrry): We generate the seed-pair once per graph in which the dataset
+ # is iterated over, and cache it in `self._graph_seed_map`. This supports
+ # two features: iterating over the same `ShuffleDataset` twice in the same
+ # pipeline and observing the same order (by tying the seeds together with
+ # a randomly-generated seed), and using `Dataset.make_one_shot_iterator()`,
+ # which requires the stateful RNG op to be created inside the same graph as
+ # the dataset.
+ self._original_seed = seed
+ self._graph_seed_map = {}
+
if reshuffle_each_iteration is None:
self._reshuffle_each_iteration = True
else:
self._reshuffle_each_iteration = reshuffle_each_iteration
def _as_variant_tensor(self):
+ try:
+ seed, seed2 = self._graph_seed_map[ops.get_default_graph()]
+ except KeyError:
+ seed, seed2 = random_seed.get_seed(self._original_seed)
+ self._graph_seed_map[ops.get_default_graph()] = (seed, seed2)
+
return gen_dataset_ops.shuffle_dataset(
self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access
buffer_size=self._buffer_size,
- seed=self._seed,
- seed2=self._seed2,
+ seed=seed,
+ seed2=seed2,
reshuffle_each_iteration=self._reshuffle_each_iteration,
**flat_structure(self))
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index 39082ce370..95bf3209d7 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -142,6 +142,7 @@ py_test(
":random_seed",
"//tensorflow/python:client_testlib",
"//tensorflow/python:framework_for_generated_wrappers",
+ "//tensorflow/python:random_ops",
"//tensorflow/python:util",
],
)
diff --git a/tensorflow/python/data/util/random_seed.py b/tensorflow/python/data/util/random_seed.py
index d5169f7a53..d24df6d957 100644
--- a/tensorflow/python/data/util/random_seed.py
+++ b/tensorflow/python/data/util/random_seed.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import ops
from tensorflow.python.framework import random_seed
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
def get_seed(seed):
@@ -37,7 +38,7 @@ def get_seed(seed):
Returns:
A tuple of two `tf.int64` scalar tensors that should be used for the local
- seed of the calling dataset.
+ seeds of the calling dataset.
"""
seed, seed2 = random_seed.get_seed(seed)
if seed is None:
@@ -45,7 +46,7 @@ def get_seed(seed):
else:
seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
if seed2 is None:
- seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
+ seed2 = random_ops.random_uniform([], 1, 2**63 - 1, dtype=dtypes.int64)
else:
with ops.name_scope("seed2") as scope:
seed2 = ops.convert_to_tensor(seed2, dtype=dtypes.int64)
diff --git a/tensorflow/python/data/util/random_seed_test.py b/tensorflow/python/data/util/random_seed_test.py
index a809151e6e..5df2e38c62 100644
--- a/tensorflow/python/data/util/random_seed_test.py
+++ b/tensorflow/python/data/util/random_seed_test.py
@@ -41,7 +41,6 @@ class RandomSeedTest(test.TestCase):
# (input_graph_seed, input_op_seed)
# and output from get_seed:
# (output_graph_seed, output_op_seed)
- ((None, None), (0, 0)),
((None, 1), (random_seed.DEFAULT_GRAPH_SEED, 1)),
((1, 1), (1, 1)),
((0, 0), (0, 2**31 - 1)), # Avoid nondeterministic (0, 0) output
@@ -78,6 +77,18 @@ class RandomSeedTest(test.TestCase):
self.assertEqual((g_seed, op_seed), toutput, msg=msg)
random_seed.set_random_seed(None)
+ @test_util.run_in_graph_and_eager_modes
+ def testNondeterministicRandomSeed(self):
+ random_seed.set_random_seed(None)
+ op_seeds = []
+ for _ in range(50):
+ g_seed, op_seed = data_random_seed.get_seed(None)
+ g_seed = self.evaluate(g_seed)
+ op_seed = self.evaluate(op_seed)
+ self.assertEqual(0, g_seed)
+ self.assertNotEqual(0, op_seed)
+ op_seeds.append(op_seed)
+ self.assertGreater(len(set(op_seeds)), 1)
if __name__ == '__main__':
test.main()