diff options
-rw-r--r-- | tensorflow/core/BUILD | 5 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream_executor_pimpl.cc | 24 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream_executor_pimpl.h | 7 | ||||
-rwxr-xr-x | tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh | 90 |
4 files changed, 67 insertions, 59 deletions
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 30c24fe24c..b1b935f1a5 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -877,7 +877,6 @@ tf_cuda_library( "util/bcast.h", "util/cuda_kernel_helper.h", "util/device_name_utils.h", - "util/env_var.h", "util/events_writer.h", "util/example_proto_fast_parsing.h", "util/example_proto_helper.h", @@ -2059,6 +2058,7 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [ "platform/snappy.h", "platform/tensor_coding.h", "platform/tracing.h", + "util/env_var.h", ] # Replicated for lib_internal and lib_internal_impl. @@ -2098,6 +2098,7 @@ cc_library( "platform/*.cc", "platform/profile_utils/**/*.cc", "framework/resource_handle.cc", + "util/env_var.cc", ], exclude = [ "**/*test*", @@ -2453,7 +2454,6 @@ FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [ "framework/unique_tensor_references.h", "framework/variant.h", "util/command_line_flags.h", - "util/env_var.h", "util/equal_graph_def.h", "util/presized_cuckoo_map.h", "util/tensor_slice_set.h", @@ -2529,6 +2529,7 @@ tf_cuda_library( "util/memmapped_file_system_writer.*", "util/stats_calculator.*", "util/version_info.cc", + "util/env_var.cc", ], ) + select({ "//tensorflow:windows": [], diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc index 9515d8e62a..10bf006787 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.cc +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -22,6 +22,7 @@ limitations under the License. #include <atomic> #include <utility> +#include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/blas.h" #include "tensorflow/stream_executor/fft.h" #include "tensorflow/stream_executor/lib/env.h" @@ -163,6 +164,15 @@ StreamExecutor::StreamExecutor(PlatformKind platform_kind, CheckPlatformKindIsValid(platform_kind); } +// Get per-device memory limit in bytes. Returns 0 if +// TF_PER_DEVICE_MEMORY_LIMIT_MB environment variable is not set. 
+static int64 GetMemoryLimitBytes() { + int64 value; + SE_CHECK_OK(tensorflow::ReadInt64FromEnvVar("TF_PER_DEVICE_MEMORY_LIMIT_MB", + 0, &value)); + return value * (1ll << 20); +} + StreamExecutor::StreamExecutor( const Platform *platform, std::unique_ptr<internal::StreamExecutorInterface> implementation) @@ -172,7 +182,9 @@ StreamExecutor::StreamExecutor( background_threads_(new port::ThreadPool( port::Env::Default(), "stream_executor", kNumBackgroundThreads)), live_stream_count_(0), - tracing_enabled_(false) { + tracing_enabled_(false), + mem_alloc_bytes_(0), + memory_limit_bytes_(GetMemoryLimitBytes()) { if (port::Lowercase(platform_->Name()) == "cuda") { platform_kind_ = PlatformKind::kCuda; } else if (port::Lowercase(platform_->Name()) == "opencl") { @@ -460,6 +472,14 @@ port::Status StreamExecutor::BlockHostUntilDone(Stream *stream) { } void *StreamExecutor::Allocate(uint64 size) { + if (memory_limit_bytes_ > 0 && + mem_alloc_bytes_ + size > memory_limit_bytes_) { + LOG(WARNING) << "Not enough memory to allocate " << size << " on device " + << device_ordinal_ + << " within provided limit. 
[used=" << mem_alloc_bytes_ + << ", limit=" << memory_limit_bytes_ << "]"; + return nullptr; + } void *buf = implementation_->Allocate(size); VLOG(1) << "Called StreamExecutor::Allocate(size=" << size << ") returns " << buf << StackTraceIfVLOG10(); @@ -779,6 +799,7 @@ void StreamExecutor::CreateAllocRecord(void *opaque, uint64 bytes) { mutex_lock lock(mu_); mem_allocs_[opaque] = AllocRecord{ bytes, ""}; + mem_alloc_bytes_ += bytes; } } @@ -789,6 +810,7 @@ void StreamExecutor::EraseAllocRecord(void *opaque) { LOG(ERROR) << "Deallocating unknown pointer: " << port::Printf("0x%p", opaque); } else { + mem_alloc_bytes_ -= mem_allocs_[opaque].bytes; mem_allocs_.erase(opaque); } } diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h index 437f298616..d04025b681 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.h +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -699,6 +699,13 @@ class StreamExecutor { // The set of TraceListeners registered for this StreamExecutor. std::set<TraceListener*> listeners_ GUARDED_BY(mu_); + // Allocated memory in bytes. + int64 mem_alloc_bytes_; + + // Memory limit in bytes. Value less or equal to 0 indicates there is no + // limit. + int64 memory_limit_bytes_; + SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutor); }; diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh index 75da9bb835..cc99f8023a 100755 --- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh +++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh @@ -16,68 +16,46 @@ # # # A script to run multiple GPU tests in parallel controlled with an environment -# variable. This script will assume that when it runs, one of the locks are -# already released. So the program calling this script is expected to make sure -# that only $TF_GPU_COUNT processes are running at any gien time. +# variable. 
# # Required environment variables: -# TF_GPU_COUNT = Number of GPUs available. This HAS TO BE IN SYNC with the -# value of --local_test_jobs flag for bazel. - -BASH_VER_MAJOR=$(echo ${BASH_VERSION} | cut -d '.' -f 1) -BASH_VER_MINOR=$(echo ${BASH_VERSION} | cut -d '.' -f 2) - -if [[ ${BASH_VER_MAJOR} -lt 4 ]]; then - echo "Insufficient bash version: ${BASH_VERSION} < 4.2" >&2 - exit 1 -elif [[ ${BASH_VER_MAJOR} -eq 4 ]] && [[ ${BASH_VER_MINOR} -lt 2 ]]; then - echo "Insufficient bash version: ${BASH_VERSION} < 4.2" >&2 - exit 1 -fi - -function is_absolute { - [[ "$1" = /* ]] || [[ "$1" =~ ^[a-zA-Z]:[/\\].* ]] -} - -RUNFILES_MANIFEST_FILE="${TEST_SRCDIR}/MANIFEST" -function rlocation() { - if is_absolute "$1" ; then - # If the file path is already fully specified, simply return it. - echo "$1" - elif [[ -e "$TEST_SRCDIR/$1" ]]; then - # If the file exists in the $TEST_SRCDIR then just use it. - echo "$TEST_SRCDIR/$1" - elif [[ -e "$RUNFILES_MANIFEST_FILE" ]]; then - # If a runfiles manifest file exists then use it. - echo "$(grep "^$1 " "$RUNFILES_MANIFEST_FILE" | sed 's/[^ ]* //')" - fi -} - -TEST_BINARY="$(rlocation $TEST_WORKSPACE/${1#./})" -shift - -# Make sure /var/lock exists, this may not be true under MSYS -mkdir -p /var/lock +# TF_GPU_COUNT = Number of GPUs available. TF_GPU_COUNT=${TF_GPU_COUNT:-8} +TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU:-4} +# We want to allow running one of the following configs: +# - 4 tests per GPU on k80 +# - 8 tests per GPU on p100 +# p100 has minimum 12G memory. Therefore, we should limit each test to 1.5G. +# To leave some room in case we want to run more tests in parallel in the +# future and to use a rounder number, we set it to 1G. +export TF_PER_DEVICE_MEMORY_LIMIT_MB=1024 -for i in `seq 0 $((TF_GPU_COUNT-1))`; do - exec {lock_fd}>/var/lock/gpulock$i || exit 1 - if flock -n "$lock_fd"; - then - ( - # This export only works within the brackets, so it is isolated to one - # single command. 
- export CUDA_VISIBLE_DEVICES=$i - echo "Running test $TEST_BINARY $* on GPU $CUDA_VISIBLE_DEVICES" - "$TEST_BINARY" $@ - ) - return_code=$? - flock -u "$lock_fd" - exit $return_code - fi +mkdir -p /var/lock +# Try to acquire any of the TF_GPU_COUNT * TF_TESTS_PER_GPU +# slots to run a test at. +# +# Prefer to allocate 1 test per GPU over 4 tests on 1 GPU. +# So, we iterate over TF_TESTS_PER_GPU first. +for j in `seq 0 $((TF_TESTS_PER_GPU-1))`; do + for i in `seq 0 $((TF_GPU_COUNT-1))`; do + echo "Trying to lock GPU $i for index $j" + exec {lock_fd}>/var/lock/gpulock${i}_${j} || exit 1 + if flock -n "$lock_fd"; + then + ( + # This export only works within the brackets, so it is isolated to one + # single command. "$@" is quoted so arguments are not re-split or globbed. + export CUDA_VISIBLE_DEVICES=$i + echo "Running test $* on GPU $CUDA_VISIBLE_DEVICES" + "$@" + ) + return_code=$? + flock -u "$lock_fd" + exit $return_code + fi + done done echo "Cannot find a free GPU to run the test $* on, exiting with failure..." exit 1 - |