diff options
author | Anna R <annarev@google.com> | 2018-09-12 12:29:19 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-09-12 12:38:24 -0700 |
commit | 28e945e590b07de137f318a70896bc4fc31f7053 (patch) | |
tree | 59435a08e14420e284ac1c0aed60074bd2a71435 /tensorflow/tools/ci_build | |
parent | f337425dc71e3ea95aa91ce401a40c1b594486ca (diff) |
Internal change.
PiperOrigin-RevId: 212684548
Diffstat (limited to 'tensorflow/tools/ci_build')
-rwxr-xr-x | tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh | 90 |
1 files changed, 34 insertions, 56 deletions
```diff
diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
index 75da9bb835..cc99f8023a 100755
--- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
+++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
@@ -16,68 +16,46 @@
 #
 #
 # A script to run multiple GPU tests in parallel controlled with an environment
-# variable. This script will assume that when it runs, one of the locks are
-# already released. So the program calling this script is expected to make sure
-# that only $TF_GPU_COUNT processes are running at any gien time.
+# variable.
 #
 # Required environment variables:
-#     TF_GPU_COUNT = Number of GPUs available. This HAS TO BE IN SYNC with the
-#                    value of --local_test_jobs flag for bazel.
-
-BASH_VER_MAJOR=$(echo ${BASH_VERSION} | cut -d '.' -f 1)
-BASH_VER_MINOR=$(echo ${BASH_VERSION} | cut -d '.' -f 2)
-
-if [[ ${BASH_VER_MAJOR} -lt 4 ]]; then
-  echo "Insufficient bash version: ${BASH_VERSION} < 4.2" >&2
-  exit 1
-elif [[ ${BASH_VER_MAJOR} -eq 4 ]] && [[ ${BASH_VER_MINOR} -lt 2 ]]; then
-  echo "Insufficient bash version: ${BASH_VERSION} < 4.2" >&2
-  exit 1
-fi
-
-function is_absolute {
-  [[ "$1" = /* ]] || [[ "$1" =~ ^[a-zA-Z]:[/\\].* ]]
-}
-
-RUNFILES_MANIFEST_FILE="${TEST_SRCDIR}/MANIFEST"
-function rlocation() {
-  if is_absolute "$1" ; then
-    # If the file path is already fully specified, simply return it.
-    echo "$1"
-  elif [[ -e "$TEST_SRCDIR/$1" ]]; then
-    # If the file exists in the $TEST_SRCDIR then just use it.
-    echo "$TEST_SRCDIR/$1"
-  elif [[ -e "$RUNFILES_MANIFEST_FILE" ]]; then
-    # If a runfiles manifest file exists then use it.
-    echo "$(grep "^$1 " "$RUNFILES_MANIFEST_FILE" | sed 's/[^ ]* //')"
-  fi
-}
-
-TEST_BINARY="$(rlocation $TEST_WORKSPACE/${1#./})"
-shift
-
-# Make sure /var/lock exists, this may not be true under MSYS
-mkdir -p /var/lock
+#     TF_GPU_COUNT = Number of GPUs available.
 
 TF_GPU_COUNT=${TF_GPU_COUNT:-8}
+TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU:-4}
+# We want to allow running one of the following configs:
+#  - 4 tests per GPU on k80
+#  - 8 tests per GPU on p100
+# p100 has minimum 12G memory. Therefore, we should limit each test to 1.5G.
+# To leave some room in case we want to run more tests in parallel in the
+# future and to use a rounder number, we set it to 1G.
+export TF_PER_DEVICE_MEMORY_LIMIT_MB=1024
 
-for i in `seq 0 $((TF_GPU_COUNT-1))`; do
-  exec {lock_fd}>/var/lock/gpulock$i || exit 1
-  if flock -n "$lock_fd";
-  then
-    (
-      # This export only works within the brackets, so it is isolated to one
-      # single command.
-      export CUDA_VISIBLE_DEVICES=$i
-      echo "Running test $TEST_BINARY $* on GPU $CUDA_VISIBLE_DEVICES"
-      "$TEST_BINARY" $@
-    )
-    return_code=$?
-    flock -u "$lock_fd"
-    exit $return_code
-  fi
+mkdir -p /var/lock
+# Try to acquire any of the TF_GPU_COUNT * TF_TESTS_PER_GPU
+# slots to run a test at.
+#
+# Prefer to allocate 1 test per GPU over 4 tests on 1 GPU.
+# So, we iterate over TF_TESTS_PER_GPU first.
+for j in `seq 0 $((TF_TESTS_PER_GPU-1))`; do
+  for i in `seq 0 $((TF_GPU_COUNT-1))`; do
+    echo "Trying to lock GPU $i for index $j"
+    exec {lock_fd}>/var/lock/gpulock${i}_${j} || exit 1
+    if flock -n "$lock_fd";
+    then
+      (
+        # This export only works within the brackets, so it is isolated to one
+        # single command.
+        export CUDA_VISIBLE_DEVICES=$i
+        echo "Running test $@ on GPU $CUDA_VISIBLE_DEVICES"
+        $@
+      )
+      return_code=$?
+      flock -u "$lock_fd"
+      exit $return_code
+    fi
+  done
 done
 
 echo "Cannot find a free GPU to run the test $* on, exiting with failure..."
 exit 1
-
```