1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
|
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Train a ConvNet on MNIST using K-FAC.
Distributed training with sync replicas optimizer. See
`convnet.train_mnist_distributed_sync_replicas` for details.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
import tensorflow as tf
from tensorflow.contrib.kfac.examples import convnet
FLAGS = flags.FLAGS
flags.DEFINE_integer("task", -1, "Task identifier")
flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
flags.DEFINE_string(
"cov_inv_op_strategy", "chief_worker",
"In dist training mode run the cov, inv ops on chief or dedicated workers."
)
flags.DEFINE_string("master", "local", "Session master.")
flags.DEFINE_integer("ps_tasks", 2,
"Number of tasks in the parameter server job.")
flags.DEFINE_integer("replicas_to_aggregate", 5,
"Number of replicas to aggregate.")
flags.DEFINE_integer("worker_replicas", 5, "Number of replicas in worker job.")
flags.DEFINE_integer("num_epochs", None, "Number of epochs.")
def _is_chief():
"""Determines whether a job is the chief worker."""
if "chief_worker" in FLAGS.brain_jobs:
return FLAGS.brain_job_name == "chief_worker"
else:
return FLAGS.task == 0
def main(unused_argv):
_ = unused_argv
convnet.train_mnist_distributed_sync_replicas(
FLAGS.task, _is_chief(), FLAGS.worker_replicas, FLAGS.ps_tasks,
FLAGS.master, FLAGS.data_dir, FLAGS.num_epochs, FLAGS.cov_inv_op_strategy)
if __name__ == "__main__":
tf.app.run(main=main)
|