author     A. Unique TensorFlower <gardener@tensorflow.org>  2016-07-19 10:51:12 -0800
committer  TensorFlower Gardener <gardener@tensorflow.org>   2016-07-19 12:03:07 -0700
commit     3d3bbc69bb653f770976878127f107a4d40b1db3 (patch)
tree       8a47dd27b9afc93d37ab27255411c5352b09626e
parent     3dc784badfc6ee18e11ca6cf428da4a6d34213b9 (diff)
Two fixes to tf-slim train().

1. When adding new ops, ensure they are added to the graph passed in the constructor.
2. Allow logdir to be None.

Change: 127856413
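For context, a minimal usage sketch (not part of this commit) of how the two fixes combine, modeled on the tests added below; the toy numpy data and the slim.fully_connected model are stand-ins for the tests' fixtures (self._inputs, self._labels, LogisticClassifier), and the step counts are arbitrary:

import numpy as np
import tensorflow as tf

slim = tf.contrib.slim

# Toy fixtures standing in for self._inputs / self._labels in learning_test.py.
inputs_np = np.random.rand(16, 4).astype(np.float32)
labels_np = np.random.randint(0, 2, size=(16, 1)).astype(np.float32)

g = tf.Graph()
with g.as_default():
  tf.set_random_seed(0)
  tf_inputs = tf.constant(inputs_np)
  tf_labels = tf.constant(labels_np)

  # A one-layer logistic model standing in for the tests' LogisticClassifier.
  tf_predictions = slim.fully_connected(tf_inputs, 1, activation_fn=tf.sigmoid)
  slim.losses.log_loss(tf_predictions, tf_labels)
  total_loss = slim.losses.get_total_loss()

  optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
  train_op = slim.learning.create_train_op(total_loss, optimizer)

  # Fix 1: ops created inside train() (global step, init op, should_stop, ...)
  # now land in the graph passed via `graph=g` rather than the default graph.
  # Fix 2: logdir=None disables checkpoint and summary writing entirely.
  loss = slim.learning.train(
      train_op, None, number_of_steps=50, log_every_n_steps=10, graph=g)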
-rw-r--r--  tensorflow/contrib/slim/python/slim/learning.py       74
-rw-r--r--  tensorflow/contrib/slim/python/slim/learning_test.py  90
2 files changed, 131 insertions(+), 33 deletions(-)
diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index 14b31cb9f9..e0d24c2fbb 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -551,7 +551,8 @@ def train(
Args:
train_op: A `Tensor` that, when executed, will apply the gradients and
return the loss value.
- logdir: The directory where training logs are written to.
+ logdir: The directory where training logs are written to. If None, model
+ checkpoints and summaries will not be written.
train_step_fn: The function to call in order to execute a single gradient
step. The function must take exactly four arguments: the current
session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
@@ -581,7 +582,7 @@ def train(
save_summaries_secs: How often, in seconds, to save summaries.
startup_delay_steps: The number of steps to wait for before beginning. Note
that this must be 0 if a sync_optimizer is supplied.
- saver: Saver to save checkpoints. If none, a default one will be created
+ saver: Saver to save checkpoints. If None, a default one will be created
and used.
save_interval_secs: How often, in seconds, to save the model to `logdir`.
sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
@@ -599,6 +600,12 @@ def train(
if train_op is None:
raise ValueError('train_op cannot be None.')
+ if logdir is None:
+ if summary_op != _USE_DEFAULT:
+ raise ValueError('Cannot provide summary_op because logdir=None')
+ if saver is not None:
+ raise ValueError('Cannot provide saver because logdir=None')
+
if sync_optimizer and startup_delay_steps > 0:
raise ValueError(
'startup_delay_steps must be zero when sync_optimizer is supplied.')
@@ -613,37 +620,37 @@ def train(
global_step = variables.get_or_create_global_step()
saver = saver or tf_saver.Saver()
- if init_op == _USE_DEFAULT:
- init_op = tf_variables.initialize_all_variables()
-
- if summary_op == _USE_DEFAULT:
- summary_op = logging_ops.merge_all_summaries()
-
- cleanup_op = None
-
- if is_chief and sync_optimizer:
- if not isinstance(sync_optimizer,
- sync_replicas_optimizer.SyncReplicasOptimizer):
- raise ValueError(
- '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')
+ if init_op == _USE_DEFAULT:
+ init_op = tf_variables.initialize_all_variables()
- # Need to create these BEFORE the supervisor finalizes the graph:
- with ops.control_dependencies([init_op]):
- init_tokens_op = sync_optimizer.get_init_tokens_op()
- init_op = init_tokens_op
- chief_queue_runner = sync_optimizer.get_chief_queue_runner()
- cleanup_op = sync_optimizer.get_clean_up_op()
+ if summary_op == _USE_DEFAULT:
+ summary_op = logging_ops.merge_all_summaries()
- if train_step_kwargs == _USE_DEFAULT:
- train_step_kwargs = {}
+ cleanup_op = None
- if number_of_steps:
- should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
- else:
- should_stop_op = constant_op.constant(False)
- train_step_kwargs['should_stop'] = should_stop_op
- train_step_kwargs['should_log'] = math_ops.equal(
- math_ops.mod(global_step, log_every_n_steps), 0)
+ if is_chief and sync_optimizer:
+ if not isinstance(sync_optimizer,
+ sync_replicas_optimizer.SyncReplicasOptimizer):
+ raise ValueError(
+ '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')
+
+ # Need to create these BEFORE the supervisor finalizes the graph:
+ with ops.control_dependencies([init_op]):
+ init_tokens_op = sync_optimizer.get_init_tokens_op()
+ init_op = init_tokens_op
+ chief_queue_runner = sync_optimizer.get_chief_queue_runner()
+ cleanup_op = sync_optimizer.get_clean_up_op()
+
+ if train_step_kwargs == _USE_DEFAULT:
+ train_step_kwargs = {}
+
+ if number_of_steps:
+ should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
+ else:
+ should_stop_op = constant_op.constant(False)
+ train_step_kwargs['should_stop'] = should_stop_op
+ train_step_kwargs['should_log'] = math_ops.equal(
+ math_ops.mod(global_step, log_every_n_steps), 0)
sv = supervisor.Supervisor(
graph=graph,
@@ -661,8 +668,9 @@ def train(
with sv.managed_session(master, start_standard_services=False) as sess:
if is_chief:
- sv.start_standard_services(sess)
- elif not is_chief and startup_delay_steps > 0:
+ if logdir:
+ sv.start_standard_services(sess)
+ elif startup_delay_steps > 0:
_wait_for_step(sess, global_step,
min(startup_delay_steps, number_of_steps or sys.maxint))
sv.start_queue_runners(sess)
@@ -682,7 +690,7 @@ def train(
# This waits for service threads to finish.
sv.Stop()
- if sv.is_chief:
+ if logdir and sv.is_chief:
logging.info('Finished training! Saving model to disk.')
sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
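For illustration (again not part of the commit), the new validation at the top of train() means logdir=None fails fast when combined with an explicit summary_op or saver, as the tests below exercise; a short sketch reusing g, total_loss, and train_op from the example above:

with g.as_default():
  # Create a summary so merge_all_summaries() returns a real op.
  tf.scalar_summary('total_loss', total_loss)
  summary_op = tf.merge_all_summaries()

  try:
    slim.learning.train(
        train_op, None, number_of_steps=1, summary_op=summary_op)
  except ValueError:
    pass  # 'Cannot provide summary_op because logdir=None'

  try:
    slim.learning.train(
        train_op, None, number_of_steps=1, saver=tf.train.Saver())
  except ValueError:
    pass  # 'Cannot provide saver because logdir=None'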
diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py
index 2fb5633e12..b57c8f8fe6 100644
--- a/tensorflow/contrib/slim/python/slim/learning_test.py
+++ b/tensorflow/contrib/slim/python/slim/learning_test.py
@@ -320,6 +320,88 @@ class TrainTest(tf.test.TestCase):
j = int(2 * self._labels[i] + np.random.randint(0, 2))
self._inputs[i, j] = 1
+ def testTrainWithNonDefaultGraph(self):
+ self._logdir = os.path.join(self.get_temp_dir(), 'tmp_logs8/')
+ g = tf.Graph()
+ with g.as_default():
+ tf.set_random_seed(0)
+ tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+ tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+ tf_predictions = LogisticClassifier(tf_inputs)
+ slim.losses.log_loss(tf_predictions, tf_labels)
+ total_loss = slim.losses.get_total_loss()
+
+ optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+
+ train_op = slim.learning.create_train_op(total_loss, optimizer)
+
+ loss = slim.learning.train(
+ train_op, self._logdir, number_of_steps=300, log_every_n_steps=10,
+ graph=g)
+ self.assertIsNotNone(loss)
+ self.assertLess(loss, .015)
+
+ def testTrainWithNoneAsLogdir(self):
+ g = tf.Graph()
+ with g.as_default():
+ tf.set_random_seed(0)
+ tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+ tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+ tf_predictions = LogisticClassifier(tf_inputs)
+ slim.losses.log_loss(tf_predictions, tf_labels)
+ total_loss = slim.losses.get_total_loss()
+
+ optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+
+ train_op = slim.learning.create_train_op(total_loss, optimizer)
+
+ loss = slim.learning.train(
+ train_op, None, number_of_steps=300, log_every_n_steps=10)
+ self.assertIsNotNone(loss)
+ self.assertLess(loss, .015)
+
+ def testTrainWithNoneAsLogdirWhenUsingSummariesRaisesError(self):
+ with tf.Graph().as_default():
+ tf.set_random_seed(0)
+ tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+ tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+ tf_predictions = LogisticClassifier(tf_inputs)
+ slim.losses.log_loss(tf_predictions, tf_labels)
+ total_loss = slim.losses.get_total_loss()
+ tf.scalar_summary('total_loss', total_loss)
+
+ optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+
+ train_op = slim.learning.create_train_op(total_loss, optimizer)
+ summary_op = tf.merge_all_summaries()
+
+ with self.assertRaises(ValueError):
+ slim.learning.train(
+ train_op, None, number_of_steps=300, summary_op=summary_op)
+
+ def testTrainWithNoneAsLogdirWhenUsingSaverRaisesError(self):
+ self._logdir = os.path.join(self.get_temp_dir(), 'tmp_logs_/')
+ with tf.Graph().as_default():
+ tf.set_random_seed(0)
+ tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+ tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+ tf_predictions = LogisticClassifier(tf_inputs)
+ slim.losses.log_loss(tf_predictions, tf_labels)
+ total_loss = slim.losses.get_total_loss()
+
+ optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+
+ train_op = slim.learning.create_train_op(total_loss, optimizer)
+ saver = tf.train.Saver()
+
+ with self.assertRaises(ValueError):
+ slim.learning.train(
+ train_op, None, init_op=None, number_of_steps=300, saver=saver)
+
def testTrainWithNoneAsInitWhenUsingVarsRaisesError(self):
self._logdir = os.path.join(self.get_temp_dir(), 'tmp_logs_/')
with tf.Graph().as_default():
@@ -358,6 +440,7 @@ class TrainTest(tf.test.TestCase):
loss = slim.learning.train(
train_op, self._logdir, number_of_steps=300, log_every_n_steps=10)
+ self.assertIsNotNone(loss)
self.assertLess(loss, .015)
def testResumeTrainAchievesRoughlyTheSameLoss(self):
@@ -382,6 +465,7 @@ class TrainTest(tf.test.TestCase):
loss = slim.learning.train(
train_op, self._logdir, number_of_steps=number_of_steps[i],
log_every_n_steps=10)
+ self.assertIsNotNone(loss)
self.assertLess(loss, .015)
def create_train_op(self, learning_rate=1.0, gradient_multiplier=1.0):
@@ -429,6 +513,7 @@ class TrainTest(tf.test.TestCase):
train_op = self.create_train_op()
loss = slim.learning.train(
train_op, logdir1, number_of_steps=300, log_every_n_steps=10)
+ self.assertIsNotNone(loss)
self.assertLess(loss, .02)
# Finally, advance the model a single step and validate that the loss is
@@ -455,6 +540,7 @@ class TrainTest(tf.test.TestCase):
init_op=init_op,
init_fn=InitAssignFn)
+ self.assertIsNotNone(loss)
self.assertLess(loss, .02)
def testTrainWithInitFromFn(self):
@@ -481,6 +567,7 @@ class TrainTest(tf.test.TestCase):
train_op = self.create_train_op()
loss = slim.learning.train(
train_op, logdir1, number_of_steps=300, log_every_n_steps=10)
+ self.assertIsNotNone(loss)
self.assertLess(loss, .015)
# Finally, advance the model a single step and validate that the loss is
@@ -501,6 +588,7 @@ class TrainTest(tf.test.TestCase):
number_of_steps=1,
init_fn=RestoreFn)
+ self.assertIsNotNone(loss)
self.assertLess(loss, .015)
def ModelLoss(self):
@@ -563,6 +651,7 @@ class TrainTest(tf.test.TestCase):
loss = slim.learning.train(
train_op, logdir1, number_of_steps=400, log_every_n_steps=10)
+ self.assertIsNotNone(loss)
self.assertLess(loss, .015)
def testTrainingSubsetsOfVariablesOnlyUpdatesThoseVariables(self):
@@ -658,6 +747,7 @@ class TrainTest(tf.test.TestCase):
loss = slim.learning.train(
train_op, logdir2, number_of_steps=number_of_steps)
losses.append(loss)
+ self.assertIsNotNone(loss)
self.assertLess(loss, .5)
# The loss of the model trained with larger learning rate should