author    | 2016-07-19 10:51:12 -0800
committer | 2016-07-19 12:03:07 -0700
commit    | 3d3bbc69bb653f770976878127f107a4d40b1db3 (patch)
tree      | 8a47dd27b9afc93d37ab27255411c5352b09626e
parent    | 3dc784badfc6ee18e11ca6cf428da4a6d34213b9 (diff)
Two fixes to tf-slim train().
1. When adding new ops, ensure they are added to the graph passed in the constructor.
2. Allow logdir to be None.
Change: 127856413
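
For context, a minimal usage sketch of the two behaviors this change covers, mirroring the new tests in the diff below. This is an illustration against the tf-slim API of this era, not part of the commit; the tiny logistic-regression model is a placeholder:

    import tensorflow as tf
    import tensorflow.contrib.slim as slim

    # Fix 1: build the model in an explicit, non-default graph and pass that
    # graph to train(); the ops train() creates (default init_op, should_stop,
    # should_log, etc.) are now added to this graph, not the default graph.
    g = tf.Graph()
    with g.as_default():
      inputs = tf.constant([[1.0, 0.0], [0.0, 1.0]])
      labels = tf.constant([[1.0], [0.0]])
      predictions = slim.fully_connected(inputs, 1, activation_fn=tf.nn.sigmoid)
      slim.losses.log_loss(predictions, labels)
      total_loss = slim.losses.get_total_loss()
      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
      train_op = slim.learning.create_train_op(total_loss, optimizer)

    # Fix 2: logdir may now be None; no checkpoints or summaries are written,
    # so an explicit summary_op or saver must not be supplied alongside it.
    loss = slim.learning.train(train_op, None, number_of_steps=10, graph=g)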
-rw-r--r-- | tensorflow/contrib/slim/python/slim/learning.py      | 74
-rw-r--r-- | tensorflow/contrib/slim/python/slim/learning_test.py | 90
2 files changed, 131 insertions, 33 deletions
diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index 14b31cb9f9..e0d24c2fbb 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -551,7 +551,8 @@ def train(
   Args:
     train_op: A `Tensor` that, when executed, will apply the gradients and
       return the loss value.
-    logdir: The directory where training logs are written to.
+    logdir: The directory where training logs are written to. If None, model
+      checkpoints and summaries will not be written.
     train_step_fn: The function to call in order to execute a single gradient
       step. The function must have take exactly four arguments: the current
       session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
@@ -581,7 +582,7 @@ def train(
     save_summaries_secs: How often, in seconds, to save summaries.
     startup_delay_steps: The number of steps to wait for before beginning. Note
       that this must be 0 if a sync_optimizer is supplied.
-    saver: Saver to save checkpoints. If none, a default one will be created
+    saver: Saver to save checkpoints. If None, a default one will be created
       and used.
     save_interval_secs: How often, in seconds, to save the model to `logdir`.
     sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
@@ -599,6 +600,12 @@ def train(
   if train_op is None:
     raise ValueError('train_op cannot be None.')
 
+  if logdir is None:
+    if summary_op != _USE_DEFAULT:
+      raise ValueError('Cannot provide summary_op because logdir=None')
+    if saver is not None:
+      raise ValueError('Cannot provide saver because logdir=None')
+
   if sync_optimizer and startup_delay_steps > 0:
     raise ValueError(
         'startup_delay_steps must be zero when sync_optimizer is supplied.')
@@ -613,37 +620,37 @@ def train(
     global_step = variables.get_or_create_global_step()
     saver = saver or tf_saver.Saver()
 
-  if init_op == _USE_DEFAULT:
-    init_op = tf_variables.initialize_all_variables()
-
-  if summary_op == _USE_DEFAULT:
-    summary_op = logging_ops.merge_all_summaries()
-
-  cleanup_op = None
-
-  if is_chief and sync_optimizer:
-    if not isinstance(sync_optimizer,
-                      sync_replicas_optimizer.SyncReplicasOptimizer):
-      raise ValueError(
-          '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')
+    if init_op == _USE_DEFAULT:
+      init_op = tf_variables.initialize_all_variables()
 
-    # Need to create these BEFORE the supervisor finalizes the graph:
-    with ops.control_dependencies([init_op]):
-      init_tokens_op = sync_optimizer.get_init_tokens_op()
-      init_op = init_tokens_op
-    chief_queue_runner = sync_optimizer.get_chief_queue_runner()
-    cleanup_op = sync_optimizer.get_clean_up_op()
+    if summary_op == _USE_DEFAULT:
+      summary_op = logging_ops.merge_all_summaries()
 
-  if train_step_kwargs == _USE_DEFAULT:
-    train_step_kwargs = {}
+    cleanup_op = None
 
-  if number_of_steps:
-    should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
-  else:
-    should_stop_op = constant_op.constant(False)
-  train_step_kwargs['should_stop'] = should_stop_op
-  train_step_kwargs['should_log'] = math_ops.equal(
-      math_ops.mod(global_step, log_every_n_steps), 0)
+    if is_chief and sync_optimizer:
+      if not isinstance(sync_optimizer,
+                        sync_replicas_optimizer.SyncReplicasOptimizer):
+        raise ValueError(
+            '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')
+
+      # Need to create these BEFORE the supervisor finalizes the graph:
+      with ops.control_dependencies([init_op]):
+        init_tokens_op = sync_optimizer.get_init_tokens_op()
+        init_op = init_tokens_op
+      chief_queue_runner = sync_optimizer.get_chief_queue_runner()
+      cleanup_op = sync_optimizer.get_clean_up_op()
+
+    if train_step_kwargs == _USE_DEFAULT:
+      train_step_kwargs = {}
+
+    if number_of_steps:
+      should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
+    else:
+      should_stop_op = constant_op.constant(False)
+    train_step_kwargs['should_stop'] = should_stop_op
+    train_step_kwargs['should_log'] = math_ops.equal(
+        math_ops.mod(global_step, log_every_n_steps), 0)
 
   sv = supervisor.Supervisor(
       graph=graph,
@@ -661,8 +668,9 @@ def train(
 
   with sv.managed_session(master, start_standard_services=False) as sess:
     if is_chief:
-      sv.start_standard_services(sess)
-    elif not is_chief and startup_delay_steps > 0:
+      if logdir:
+        sv.start_standard_services(sess)
+    elif startup_delay_steps > 0:
       _wait_for_step(sess, global_step,
                      min(startup_delay_steps, number_of_steps or sys.maxint))
     sv.start_queue_runners(sess)
@@ -682,7 +690,7 @@ def train(
 
     # This waits for service threads to finish.
     sv.Stop()
 
-    if sv.is_chief:
+    if logdir and sv.is_chief:
      logging.info('Finished training! Saving model to disk.')
      sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py
index 2fb5633e12..b57c8f8fe6 100644
--- a/tensorflow/contrib/slim/python/slim/learning_test.py
+++ b/tensorflow/contrib/slim/python/slim/learning_test.py
@@ -320,6 +320,88 @@ class TrainTest(tf.test.TestCase):
       j = int(2 * self._labels[i] + np.random.randint(0, 2))
       self._inputs[i, j] = 1
 
+  def testTrainWithNonDefaultGraph(self):
+    self._logdir = os.path.join(self.get_temp_dir(), 'tmp_logs8/')
+    g = tf.Graph()
+    with g.as_default():
+      tf.set_random_seed(0)
+      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+      tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+      tf_predictions = LogisticClassifier(tf_inputs)
+      slim.losses.log_loss(tf_predictions, tf_labels)
+      total_loss = slim.losses.get_total_loss()
+
+      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+
+      train_op = slim.learning.create_train_op(total_loss, optimizer)
+
+    loss = slim.learning.train(
+        train_op, self._logdir, number_of_steps=300, log_every_n_steps=10,
+        graph=g)
+    self.assertIsNotNone(loss)
+    self.assertLess(loss, .015)
+
+  def testTrainWithNoneAsLogdir(self):
+    g = tf.Graph()
+    with g.as_default():
+      tf.set_random_seed(0)
+      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+      tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+      tf_predictions = LogisticClassifier(tf_inputs)
+      slim.losses.log_loss(tf_predictions, tf_labels)
+      total_loss = slim.losses.get_total_loss()
+
+      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+
+      train_op = slim.learning.create_train_op(total_loss, optimizer)
+
+      loss = slim.learning.train(
+          train_op, None, number_of_steps=300, log_every_n_steps=10)
+    self.assertIsNotNone(loss)
+    self.assertLess(loss, .015)
+
+  def testTrainWithNoneAsLogdirWhenUsingSummariesRaisesError(self):
+    with tf.Graph().as_default():
+      tf.set_random_seed(0)
+      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+      tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+      tf_predictions = LogisticClassifier(tf_inputs)
+      slim.losses.log_loss(tf_predictions, tf_labels)
+      total_loss = slim.losses.get_total_loss()
+      tf.scalar_summary('total_loss', total_loss)
+
+      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+
+      train_op = slim.learning.create_train_op(total_loss, optimizer)
+      summary_op = tf.merge_all_summaries()
+
+      with self.assertRaises(ValueError):
+        slim.learning.train(
+            train_op, None, number_of_steps=300, summary_op=summary_op)
+
+  def testTrainWithNoneAsLogdirWhenUsingSaverRaisesError(self):
+    self._logdir = os.path.join(self.get_temp_dir(), 'tmp_logs_/')
+    with tf.Graph().as_default():
+      tf.set_random_seed(0)
+      tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+      tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+      tf_predictions = LogisticClassifier(tf_inputs)
+      slim.losses.log_loss(tf_predictions, tf_labels)
+      total_loss = slim.losses.get_total_loss()
+
+      optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+
+      train_op = slim.learning.create_train_op(total_loss, optimizer)
+      saver = tf.train.Saver()
+
+      with self.assertRaises(ValueError):
+        slim.learning.train(
+            train_op, None, init_op=None, number_of_steps=300, saver=saver)
+
   def testTrainWithNoneAsInitWhenUsingVarsRaisesError(self):
     self._logdir = os.path.join(self.get_temp_dir(), 'tmp_logs_/')
     with tf.Graph().as_default():
@@ -358,6 +440,7 @@ class TrainTest(tf.test.TestCase):
 
       loss = slim.learning.train(
           train_op, self._logdir, number_of_steps=300, log_every_n_steps=10)
+      self.assertIsNotNone(loss)
       self.assertLess(loss, .015)
 
   def testResumeTrainAchievesRoughlyTheSameLoss(self):
@@ -382,6 +465,7 @@ class TrainTest(tf.test.TestCase):
         loss = slim.learning.train(
             train_op, self._logdir, number_of_steps=number_of_steps[i],
             log_every_n_steps=10)
+        self.assertIsNotNone(loss)
         self.assertLess(loss, .015)
 
   def create_train_op(self, learning_rate=1.0, gradient_multiplier=1.0):
@@ -429,6 +513,7 @@ class TrainTest(tf.test.TestCase):
       train_op = self.create_train_op()
       loss = slim.learning.train(
           train_op, logdir1, number_of_steps=300, log_every_n_steps=10)
+      self.assertIsNotNone(loss)
       self.assertLess(loss, .02)
 
     # Finally, advance the model a single step and validate that the loss is
@@ -455,6 +540,7 @@ class TrainTest(tf.test.TestCase):
           init_op=init_op,
           init_fn=InitAssignFn)
 
+      self.assertIsNotNone(loss)
       self.assertLess(loss, .02)
 
   def testTrainWithInitFromFn(self):
@@ -481,6 +567,7 @@ class TrainTest(tf.test.TestCase):
       train_op = self.create_train_op()
       loss = slim.learning.train(
           train_op, logdir1, number_of_steps=300, log_every_n_steps=10)
+      self.assertIsNotNone(loss)
       self.assertLess(loss, .015)
 
     # Finally, advance the model a single step and validate that the loss is
@@ -501,6 +588,7 @@ class TrainTest(tf.test.TestCase):
           number_of_steps=1,
           init_fn=RestoreFn)
 
+      self.assertIsNotNone(loss)
       self.assertLess(loss, .015)
 
   def ModelLoss(self):
@@ -563,6 +651,7 @@ class TrainTest(tf.test.TestCase):
 
       loss = slim.learning.train(
           train_op, logdir1, number_of_steps=400, log_every_n_steps=10)
+      self.assertIsNotNone(loss)
       self.assertLess(loss, .015)
 
   def testTrainingSubsetsOfVariablesOnlyUpdatesThoseVariables(self):
@@ -658,6 +747,7 @@ class TrainTest(tf.test.TestCase):
       loss = slim.learning.train(
           train_op, logdir2, number_of_steps=number_of_steps)
       losses.append(loss)
+      self.assertIsNotNone(loss)
       self.assertLess(loss, .5)
 
     # The loss of the model trained with larger learning rate should
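
As the new tests exercise, supplying an explicit summary op or saver while logdir is None now fails fast rather than silently skipping writes. A minimal sketch of the guarded call (assuming train_op was built as in the earlier example; the error strings are the ones added in learning.py):

    summary_op = tf.merge_all_summaries()
    try:
      slim.learning.train(train_op, None, number_of_steps=300,
                          summary_op=summary_op)
    except ValueError as e:
      print(e)  # 'Cannot provide summary_op because logdir=None'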