aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/tools/dist_test/remote_test.sh
blob: 935535312d326a70aaf949332435569567cb1ca7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env bash
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# This is the entry-point script to testing TensorFlow's distributed runtime.
# It builds a docker image with the necessary gcloud and Kubernetes (k8s) tools
# installed, and then execute k8s cluster preparation and distributed TensorFlow
# runs from within a container based on the image.
#
# Usage:
#   remote_test.sh <whl_url>
#                  [--setup_cluster_only]
#                  [--num_workers <NUM_WORKERS>]
#                  [--num_parameter_servers <NUM_PARAMETER_SERVERS>]
#                  [--sync_replicas]
#
# Arguments:
# <whl_url>
#   Specify custom TensorFlow whl file URL to install in the test Docker image.
#
# --setup_cluster_only:
#       Setup the TensorFlow k8s cluster only, and do not perform testing of
#       the distributed runtime.
#
# --num_workers <NUM_WORKERS>:
#   Specifies the number of worker pods to start
#
# --num_parameter_server <NUM_PARAMETER_SERVERS>:
#   Specifies the number of parameter servers to start
#
# --sync_replicas
#   Use the synchronized-replica mode. The parameter updates from the replicas
#   (workers) will be aggregated before applied, which avoids stale parameter
#   updates.
#
#
#
# If any of the following environment variable has non-empty values, it will
# be mapped into the docker container to override the default values (see
# dist_test.sh)
#   TF_DIST_GRPC_SERVER_URL:      URL to an existing TensorFlow GRPC server.
#                                 If set to any non-empty and valid value (e.g.,
#                                 grpc://1.2.3.4:2222), it will cause the test
#                                 to bypass the k8s cluster setup and
#                                 teardown process, and just use the this URL
#                                 as the master session.
#   TF_DIST_GCLOUD_PROJECT:       gcloud project in which the GKE cluster
#                                 will be created (takes effect only if
#                                 TF_DIST_GRPC_SERVER_URL is empty, same below)
#   TF_DIST_GCLOUD_COMPUTE_ZONE:  gcloud compute zone.
#   TF_DIST_CONTAINER_CLUSTER:    name of the GKE cluster
#   TF_DIST_GCLOUD_KEY_FILE:      path to the gloud service JSON key file
#   TF_DIST_GRPC_PORT:            port on which to create the TensorFlow GRPC
#                                 servers
#   TF_DIST_DOCKER_NO_CACHE:      do not use cache when building docker images

die() {
  echo $@
  exit 1
}

DOCKER_IMG_NAME="tensorflow/tf-dist-test-client"

# Get current script directory
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Prepare environment variables for the docker container
DOCKER_ENV_FLAGS=""
if [[ ! -z "$TF_DIST_GRPC_SERVER_URL" ]]; then
  DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\
"-e TF_DIST_GRPC_SERVER_URL=${TF_DIST_GRPC_SERVER_URL}"
fi
if [[ ! -z "$TF_DIST_GCLOUD_PROJECT" ]]; then
  DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\
"-e TF_DIST_GCLOUD_PROJECT=${TF_DIST_GCLOUD_PROJECT}"
fi
if [[ ! -z "$TF_DIST_GCLOUD_COMPUTE_ZONE" ]]; then
  DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\
"-e TF_DIST_GCLOUD_COMPUTE_ZONE=${TF_DIST_GCLOUD_COMPUTE_ZONE}"
fi
if [[ ! -z "$TF_DIST_CONTAINER_CLUSTER" ]]; then
  DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\
"-e TF_DIST_CONTAINER_CLUSTER=${TF_DIST_CONTAINER_CLUSTER}"
fi
if [[ ! -z "$TF_DIST_GRPC_PORT" ]]; then
  DOCKER_ENV_FLAGS="${DOCKER_ENV_FLAGS} "\
"-e TF_DIST_GRPC_PORT=${TF_DIST_GRPC_PORT}"
fi

NO_CACHE_FLAG=""
if [[ ! -z "${TF_DIST_DOCKER_NO_CACHE}" ]] &&
   [[ "${TF_DIST_DOCKER_NO_CACHE}" != "0" ]]; then
  NO_CACHE_FLAG="--no-cache"
fi

# Parse command-line arguments.
WHL_URL=${1}
if [[ -z "${WHL_URL}" ]]; then
  die "whl URL is not specified"
fi

# Create docker build context directory.
BUILD_DIR=$(mktemp -d)
echo ""
echo "Using custom whl file URL: ${WHL_URL}"
echo "Building in temporary directory: ${BUILD_DIR}"

cp -r ${DIR}/* ${BUILD_DIR}/ || \
  die "Failed to copy files to ${BUILD_DIR}"

# Download whl file into the build context directory.
wget -P "${BUILD_DIR}" ${WHL_URL} || \
  die "Failed to download tensorflow whl file from URL: ${WHL_URL}"

# Build docker image for test.
docker build ${NO_CACHE_FLAG} \
    -t ${DOCKER_IMG_NAME} -f "${BUILD_DIR}/Dockerfile" "${BUILD_DIR}" || \
    die "Failed to build docker image: ${DOCKER_IMG_NAME}"

# Clean up docker build context directory.
rm -rf "${BUILD_DIR}"

# Run docker image for test.
KEY_FILE=${TF_DIST_GCLOUD_KEY_FILE:-"${HOME}/gcloud-secrets/tensorflow-testing.json"}

docker run --rm -v ${KEY_FILE}:/var/gcloud/secrets/tensorflow-testing.json \
  ${DOCKER_ENV_FLAGS} \
  ${DOCKER_IMG_NAME} \
  /var/tf-dist-test/scripts/dist_test.sh $@