path: root/tensorflow/contrib/tpu/python/tpu/tpu_config.py
Diffstat (limited to 'tensorflow/contrib/tpu/python/tpu/tpu_config.py')
-rw-r--r--  tensorflow/contrib/tpu/python/tpu/tpu_config.py | 58
1 file changed, 28 insertions(+), 30 deletions(-)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 6d7331e3c7..9e010922dc 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -23,8 +23,6 @@ import collections
import json
import os
-import numpy as np
-
from tensorflow.contrib.tpu.python.tpu import util as util_lib
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.estimator import run_config as run_config_lib
@@ -43,6 +41,7 @@ class InputPipelineConfig(object):
PER_SHARD_V1 = 1
PER_HOST_V1 = 2
PER_HOST_V2 = 3
+ BROADCAST = 4
# TODO(b/72511246) Provide a simplified api to configure model parallelism.
@@ -50,7 +49,7 @@ class TPUConfig(
collections.namedtuple('TPUConfig', [
'iterations_per_loop',
'num_shards',
- 'computation_shape',
+ 'num_cores_per_replica',
'per_host_input_for_training',
'tpu_job_name',
'initial_infeed_sleep_secs',
@@ -67,22 +66,22 @@ class TPUConfig(
case, this number equals the total number of TPU cores. For
model-parallelism, the total number of TPU cores equals
product(computation_shape) * num_shards.
- computation_shape: Defaults to `None`, which disables model parallelism. A
- list of size 3 which describes the shape of a model replica's block of
- cores. This is required by model-parallelism which enables partitioning
- the model to multiple cores. For example, [2, 2, 1] means the model is
- partitioned across 4 cores which span two cores in both x and y
- coordinates. Please refer to @{tf.contrib.tpu.Topology} for the
- geometry of a TPU mesh.
+ num_cores_per_replica: Defaults to `None`, which disables model parallelism.
+ An integer which describes the number of TPU cores per model replica. This
+ is required by model parallelism, which enables partitioning the model
+ across multiple cores. Currently, num_cores_per_replica must be
+ 1, 2, 4, or 8.
per_host_input_for_training: If `True`, `PER_HOST_V1`, or `PER_HOST_V2`,
- `input_fn` is invoked per-host rather than per-core. With per-host input
- pipeline configuration, `input_fn` is invoked once on each host. With the
- per-core input pipeline configuration, it is invoked once for each core.
+ `input_fn` is invoked once on each host. With the per-core input pipeline
+ configuration, it is invoked once for each core.
With a global batch size `train_batch_size` in `TPUEstimator` constructor,
the batch size for each shard is `train_batch_size` // #hosts in the
`True` or `PER_HOST_V1` mode. In `PER_HOST_V2` mode, it is
- `train_batch_size` // #cores. With the per-core input pipeline
- configuration, the shard batch size is also `train_batch_size` // #cores.
+ `train_batch_size` // #cores. In `BROADCAST` mode, `input_fn` is invoked
+ only once on host 0 and the tensors are broadcast to all other replicas.
+ The batch size equals `train_batch_size`. With the per-core input pipeline
+ configuration, the shard batch size is also `train_batch_size` // #cores.
Note: per_host_input_for_training==PER_SHARD_V1 only supports mode.TRAIN.
tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
within TPUEstimator, however when using ClusterSpec propagation in more
@@ -99,7 +98,7 @@ class TPUConfig(
def __new__(cls,
iterations_per_loop=2,
num_shards=None,
- computation_shape=None,
+ num_cores_per_replica=None,
per_host_input_for_training=True,
tpu_job_name=None,
initial_infeed_sleep_secs=None):
@@ -112,19 +111,12 @@ class TPUConfig(
if num_shards is not None:
util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
- # Check computation_shape
- if computation_shape is not None and len(computation_shape) != 3:
- raise ValueError(
- 'computation_shape must be a list with length 3 or None; got {}'.
- format(str(computation_shape)))
-
- if computation_shape is not None:
- computation_shape_array = np.asarray(computation_shape, dtype=np.int32)
- # This prevents any computation being replicated across multiple hosts, so
- # that each host feeds the same number of computations.
- if any(computation_shape_array < 1) or any(computation_shape_array > 2):
- raise ValueError('computation_shape elements can only be 1 or 2; got '
- 'computation_shape={}'.format(computation_shape))
+ # Check num_cores_per_replica
+ if num_cores_per_replica is not None:
+ if num_cores_per_replica not in [1, 2, 4, 8]:
+ raise ValueError(
+ 'num_cores_per_replica must be 1, 2, 4, or 8; got {}'.format(
+ str(num_cores_per_replica)))
# per_host_input_for_training may be True, False, or integer in [1..3].
# Map legacy values (True, False) to numeric values.
@@ -144,7 +136,7 @@ class TPUConfig(
cls,
iterations_per_loop=iterations_per_loop,
num_shards=num_shards,
- computation_shape=computation_shape,
+ num_cores_per_replica=num_cores_per_replica,
per_host_input_for_training=per_host_input_for_training,
tpu_job_name=tpu_job_name,
initial_infeed_sleep_secs=initial_infeed_sleep_secs)
@@ -214,6 +206,12 @@ class RunConfig(run_config_lib.RunConfig):
self._session_config.cluster_def.CopyFrom(
self._cluster_spec.as_cluster_def())
+ def _maybe_overwrite_session_config_for_distributed_training(self):
+ # Overrides the parent class's session_config overwrite, which targets
+ # between-graph replication. TPU runs with in-graph replication, which
+ # should not have a device filter. Doing nothing ("pass") disables it.
+ pass
+
@property
def evaluation_master(self):
return self._evaluation_master
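
For reference, a minimal usage sketch of the renamed argument and the new input mode follows. It is not part of the diff itself; it assumes the tf.contrib.tpu API surface of this branch (TPUConfig, InputPipelineConfig, RunConfig), and the model_dir value is a placeholder.

    # A minimal sketch, assuming tf.contrib.tpu re-exports TPUConfig,
    # InputPipelineConfig, and RunConfig as on this branch.
    from tensorflow.contrib import tpu as contrib_tpu

    # Model parallelism is now requested with an integer number of cores per
    # replica (1, 2, 4, or 8) instead of a 3-element computation_shape list.
    model_parallel_config = contrib_tpu.TPUConfig(
        iterations_per_loop=100,
        num_shards=8,
        num_cores_per_replica=2,
    )

    # The new BROADCAST mode invokes input_fn once on host 0 and broadcasts
    # the resulting tensors to all other replicas.
    broadcast_input_config = contrib_tpu.TPUConfig(
        iterations_per_loop=100,
        num_shards=8,
        per_host_input_for_training=contrib_tpu.InputPipelineConfig.BROADCAST,
    )

    run_config = contrib_tpu.RunConfig(
        model_dir='/tmp/tpu_model',  # placeholder path
        tpu_config=model_parallel_config,
    )

Constructing the configs only exercises the validation added in this change (the 1/2/4/8 check on num_cores_per_replica); whether a given combination of input mode and model parallelism is supported is still decided later by TPUEstimator.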