about | summary | refs | log | tree | commit | diff | homepage
path: root/tensorflow/tools/ci_build
diff options
context:
space:
mode:
author: Anna R <annarev@google.com> 2018-09-12 12:29:19 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-09-12 12:38:24 -0700
commit: 28e945e590b07de137f318a70896bc4fc31f7053 (patch)
tree: 59435a08e14420e284ac1c0aed60074bd2a71435 /tensorflow/tools/ci_build
parent: f337425dc71e3ea95aa91ce401a40c1b594486ca (diff)
Internal change.
PiperOrigin-RevId: 212684548
Diffstat (limited to 'tensorflow/tools/ci_build')
-rwxr-xr-x tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh 90
1 files changed, 34 insertions, 56 deletions
diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
index 75da9bb835..cc99f8023a 100755
--- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
+++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
@@ -16,68 +16,46 @@
#
#
# A script to run multiple GPU tests in parallel controlled with an environment
-# variable. This script will assume that when it runs, one of the locks are
-# already released. So the program calling this script is expected to make sure
-# that only $TF_GPU_COUNT processes are running at any gien time.
+# variable.
#
# Required environment variables:
-# TF_GPU_COUNT = Number of GPUs available. This HAS TO BE IN SYNC with the
-# value of --local_test_jobs flag for bazel.
-
-BASH_VER_MAJOR=$(echo ${BASH_VERSION} | cut -d '.' -f 1)
-BASH_VER_MINOR=$(echo ${BASH_VERSION} | cut -d '.' -f 2)
-
-if [[ ${BASH_VER_MAJOR} -lt 4 ]]; then
- echo "Insufficient bash version: ${BASH_VERSION} < 4.2" >&2
- exit 1
-elif [[ ${BASH_VER_MAJOR} -eq 4 ]] && [[ ${BASH_VER_MINOR} -lt 2 ]]; then
- echo "Insufficient bash version: ${BASH_VERSION} < 4.2" >&2
- exit 1
-fi
-
-function is_absolute {
- [[ "$1" = /* ]] || [[ "$1" =~ ^[a-zA-Z]:[/\\].* ]]
-}
-
-RUNFILES_MANIFEST_FILE="${TEST_SRCDIR}/MANIFEST"
-function rlocation() {
- if is_absolute "$1" ; then
- # If the file path is already fully specified, simply return it.
- echo "$1"
- elif [[ -e "$TEST_SRCDIR/$1" ]]; then
- # If the file exists in the $TEST_SRCDIR then just use it.
- echo "$TEST_SRCDIR/$1"
- elif [[ -e "$RUNFILES_MANIFEST_FILE" ]]; then
- # If a runfiles manifest file exists then use it.
- echo "$(grep "^$1 " "$RUNFILES_MANIFEST_FILE" | sed 's/[^ ]* //')"
- fi
-}
-
-TEST_BINARY="$(rlocation $TEST_WORKSPACE/${1#./})"
-shift
-
-# Make sure /var/lock exists, this may not be true under MSYS
-mkdir -p /var/lock
+# TF_GPU_COUNT = Number of GPUs available (defaults to 8 when unset or empty).
TF_GPU_COUNT=${TF_GPU_COUNT:-8}
+TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU:-4}
+# TF_TESTS_PER_GPU defaults to 4. We want to allow either of these configs:
+# - 4 tests per GPU on k80
+# - 8 tests per GPU on p100
+# p100 has minimum 12G memory, so 8 tests per GPU leaves 12G / 8 = 1.5G each.
+# To leave some room in case we want to run more tests in parallel in the
+# future, and to use a rounder number, we set the limit to 1G (1024 MB).
+export TF_PER_DEVICE_MEMORY_LIMIT_MB=1024
-for i in `seq 0 $((TF_GPU_COUNT-1))`; do
- exec {lock_fd}>/var/lock/gpulock$i || exit 1
- if flock -n "$lock_fd";
- then
- (
- # This export only works within the brackets, so it is isolated to one
- # single command.
- export CUDA_VISIBLE_DEVICES=$i
- echo "Running test $TEST_BINARY $* on GPU $CUDA_VISIBLE_DEVICES"
- "$TEST_BINARY" $@
- )
- return_code=$?
- flock -u "$lock_fd"
- exit $return_code
- fi
+mkdir -p /var/lock
+# Try to acquire any one of the TF_GPU_COUNT * TF_TESTS_PER_GPU slots in
+# which a test can run; each slot is the lock file /var/lock/gpulock<i>_<j>.
+#
+# Prefer to allocate 1 test per GPU over 4 tests on 1 GPU.
+# So, the per-GPU slot index j is the outer loop and the GPU index i the inner.
+for j in $(seq 0 $((TF_TESTS_PER_GPU-1))); do
+  for i in $(seq 0 $((TF_GPU_COUNT-1))); do
+    echo "Trying to lock GPU $i for index $j"
+    exec {lock_fd}>"/var/lock/gpulock${i}_${j}" || exit 1
+    if flock -n "$lock_fd";
+    then
+      (
+        # This export only takes effect inside the subshell parentheses, so it
+        # is isolated to one single command. "$@" keeps each argument intact.
+        export CUDA_VISIBLE_DEVICES=$i
+        echo "Running test $* on GPU $CUDA_VISIBLE_DEVICES"
+        "$@"
+      )
+      return_code=$?
+      flock -u "$lock_fd"
+      exit "$return_code"
+    fi
+  done
done
echo "Cannot find a free GPU to run the test $* on, exiting with failure..."
exit 1
-