#!/usr/bin/env bash # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== # # Tests distributed TensorFlow on a locally running TF GRPC cluster. # # This script peforms the following steps: # 1) Build the docker-in-docker (dind) image capable of running docker and # Kubernetes (k8s) cluster inside. # 2) Run a container from the aforementioned image and start docker service # in it # 3) Call a script to launch a k8s TensorFlow GRPC cluster inside the container # and run the distributed test suite. # # Usage: local_test.sh # [--leave_container_running] # [--model_name ] # [--num_workers ] # [--num_parameter_servers ] # [--sync_replicas] # # E.g., local_test.sh --model_name CENSUS_WIDENDEEP # local_test.sh --num_workers 3 --num_parameter_servers 3 # # Arguments: # # Specify custom TensorFlow whl file URL to install in the test Docker image. # # --leave_container_running: Do not stop the docker-in-docker container after # the termination of the tests, e.g., for debugging # # --num_workers : # Specifies the number of worker pods to start # # --num_parameter_server : # Specifies the number of parameter servers to start # # --sync_replicas # Use the synchronized-replica mode. The parameter updates from the replicas # (workers) will be aggregated before applied, which avoids stale parameter # updates. # # # In addition, this script obeys the following environment variables: # TF_DIST_DOCKER_NO_CACHE: do not use cache when building docker images die() { echo $@ exit 1 } # Configurations DOCKER_IMG_NAME="tensorflow/tf-dist-test-local-cluster" LOCAL_K8S_CACHE=${HOME}/kubernetes # Helper function get_container_id_by_image_name() { # Get the id of a container by image name # Usage: get_docker_container_id_by_image_name echo $(docker ps | grep $1 | awk '{print $1}') } # Parse input arguments LEAVE_CONTAINER_RUNNING=0 MODEL_NAME="" MODEL_NAME_FLAG="" NUM_WORKERS=2 NUM_PARAMETER_SERVERS=2 SYNC_REPLICAS_FLAG="" WHL_URL=${1} if [[ -z "${WHL_URL}" ]]; then die "whl file URL is not specified" fi while true; do if [[ $1 == "--leave_container_running" ]]; then LEAVE_CONTAINER_RUNNING=1 elif [[ $1 == "--model_name" ]]; then MODEL_NAME="$2" MODEL_NAME_FLAG="--model_name ${MODEL_NAME}" elif [[ $1 == "--num_workers" ]]; then NUM_WORKERS=$2 elif [[ $1 == "--num_parameter_servers" ]]; then NUM_PARAMETER_SERVERS=$2 elif [[ $1 == "--sync_replicas" ]]; then SYNC_REPLICAS_FLAG="--sync_replicas" elif [[ $1 == "--whl_url" ]]; then WHL_URL=$2 fi shift if [[ -z $1 ]]; then break fi done echo "LEAVE_CONTAINER_RUNNING: ${LEAVE_CONTAINER_RUNNING}" echo "MODEL_NAME: \"${MODEL_NAME}\"" echo "NUM_WORKERS: ${NUM_WORKERS}" echo "NUM_PARAMETER_SERVERS: ${NUM_PARAMETER_SERVERS}" echo "SYNC_REPLICAS_FLAG: \"${SYNC_REPLICAS_FLAG}\"" # Current script directory DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Get utility functions source ${DIR}/scripts/utils.sh # Build docker-in-docker image for local k8s cluster. NO_CACHE_FLAG="" if [[ ! -z "${TF_DIST_DOCKER_NO_CACHE}" ]] && [[ "${TF_DIST_DOCKER_NO_CACHE}" != "0" ]]; then NO_CACHE_FLAG="--no-cache" fi # Create docker build context directory. BUILD_DIR=$(mktemp -d) echo "" echo "Using whl file URL: ${WHL_URL}" echo "Building in temporary directory: ${BUILD_DIR}" cp -r ${DIR}/* "${BUILD_DIR}"/ || \ die "Failed to copy files to ${BUILD_DIR}" # Download whl file into the build context directory. wget -P "${BUILD_DIR}" ${WHL_URL} || \ die "Failed to download tensorflow whl file from URL: ${WHL_URL}" # Build docker image for test. docker build ${NO_CACHE_FLAG} -t ${DOCKER_IMG_NAME} \ -f "${BUILD_DIR}/Dockerfile.local" "${BUILD_DIR}" || \ die "Failed to build docker image: ${DOCKER_IMG_NAME}" # Clean up docker build context directory. rm -rf "${BUILD_DIR}" # Run docker image for test. docker run ${DOCKER_IMG_NAME} \ /var/tf_dist_test/scripts/dist_mnist_test.sh \ --ps_hosts "localhost:2000,localhost:2001" \ --worker_hosts "localhost:3000,localhost:3001" \ --num_gpus 0 ${SYNC_REPLICAS_FLAG}