aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--tensorflow/core/BUILD5
-rw-r--r--tensorflow/stream_executor/stream_executor_pimpl.cc24
-rw-r--r--tensorflow/stream_executor/stream_executor_pimpl.h7
-rwxr-xr-xtensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh90
4 files changed, 67 insertions, 59 deletions
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 30c24fe24c..b1b935f1a5 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -877,7 +877,6 @@ tf_cuda_library(
"util/bcast.h",
"util/cuda_kernel_helper.h",
"util/device_name_utils.h",
- "util/env_var.h",
"util/events_writer.h",
"util/example_proto_fast_parsing.h",
"util/example_proto_helper.h",
@@ -2059,6 +2058,7 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [
"platform/snappy.h",
"platform/tensor_coding.h",
"platform/tracing.h",
+ "util/env_var.h",
]
# Replicated for lib_internal and lib_internal_impl.
@@ -2098,6 +2098,7 @@ cc_library(
"platform/*.cc",
"platform/profile_utils/**/*.cc",
"framework/resource_handle.cc",
+ "util/env_var.cc",
],
exclude = [
"**/*test*",
@@ -2453,7 +2454,6 @@ FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [
"framework/unique_tensor_references.h",
"framework/variant.h",
"util/command_line_flags.h",
- "util/env_var.h",
"util/equal_graph_def.h",
"util/presized_cuckoo_map.h",
"util/tensor_slice_set.h",
@@ -2529,6 +2529,7 @@ tf_cuda_library(
"util/memmapped_file_system_writer.*",
"util/stats_calculator.*",
"util/version_info.cc",
+ "util/env_var.cc",
],
) + select({
"//tensorflow:windows": [],
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 9515d8e62a..10bf006787 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -22,6 +22,7 @@ limitations under the License.
#include <atomic>
#include <utility>
+#include "tensorflow/core/util/env_var.h"
#include "tensorflow/stream_executor/blas.h"
#include "tensorflow/stream_executor/fft.h"
#include "tensorflow/stream_executor/lib/env.h"
@@ -163,6 +164,15 @@ StreamExecutor::StreamExecutor(PlatformKind platform_kind,
CheckPlatformKindIsValid(platform_kind);
}
+// Get per-device memory limit in bytes. Returns 0 if
+// TF_PER_DEVICE_MEMORY_LIMIT_MB environment variable is not set.
+static int64 GetMemoryLimitBytes() {
+ int64 value;
+ SE_CHECK_OK(tensorflow::ReadInt64FromEnvVar("TF_PER_DEVICE_MEMORY_LIMIT_MB",
+ 0, &value));
+ return value * (1ll << 20);
+}
+
StreamExecutor::StreamExecutor(
const Platform *platform,
std::unique_ptr<internal::StreamExecutorInterface> implementation)
@@ -172,7 +182,9 @@ StreamExecutor::StreamExecutor(
background_threads_(new port::ThreadPool(
port::Env::Default(), "stream_executor", kNumBackgroundThreads)),
live_stream_count_(0),
- tracing_enabled_(false) {
+ tracing_enabled_(false),
+ mem_alloc_bytes_(0),
+ memory_limit_bytes_(GetMemoryLimitBytes()) {
if (port::Lowercase(platform_->Name()) == "cuda") {
platform_kind_ = PlatformKind::kCuda;
} else if (port::Lowercase(platform_->Name()) == "opencl") {
@@ -460,6 +472,14 @@ port::Status StreamExecutor::BlockHostUntilDone(Stream *stream) {
}
void *StreamExecutor::Allocate(uint64 size) {
+ if (memory_limit_bytes_ > 0 &&
+ mem_alloc_bytes_ + size > memory_limit_bytes_) {
+ LOG(WARNING) << "Not enough memory to allocate " << size << " on device "
+ << device_ordinal_
+ << " within provided limit. [used=" << mem_alloc_bytes_
+ << ", limit=" << memory_limit_bytes_ << "]";
+ return nullptr;
+ }
void *buf = implementation_->Allocate(size);
VLOG(1) << "Called StreamExecutor::Allocate(size=" << size << ") returns "
<< buf << StackTraceIfVLOG10();
@@ -779,6 +799,7 @@ void StreamExecutor::CreateAllocRecord(void *opaque, uint64 bytes) {
mutex_lock lock(mu_);
mem_allocs_[opaque] = AllocRecord{
bytes, ""};
+ mem_alloc_bytes_ += bytes;
}
}
@@ -789,6 +810,7 @@ void StreamExecutor::EraseAllocRecord(void *opaque) {
LOG(ERROR) << "Deallocating unknown pointer: "
<< port::Printf("0x%p", opaque);
} else {
+ mem_alloc_bytes_ -= mem_allocs_[opaque].bytes;
mem_allocs_.erase(opaque);
}
}
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 437f298616..d04025b681 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -699,6 +699,13 @@ class StreamExecutor {
// The set of TraceListeners registered for this StreamExecutor.
std::set<TraceListener*> listeners_ GUARDED_BY(mu_);
+ // Allocated memory in bytes.
+ int64 mem_alloc_bytes_;
+
+ // Memory limit in bytes. Value less than or equal to 0 indicates there is no
+ // limit.
+ int64 memory_limit_bytes_;
+
SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutor);
};
diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
index 75da9bb835..cc99f8023a 100755
--- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
+++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
@@ -16,68 +16,46 @@
#
#
# A script to run multiple GPU tests in parallel controlled with an environment
-# variable. This script will assume that when it runs, one of the locks are
-# already released. So the program calling this script is expected to make sure
-# that only $TF_GPU_COUNT processes are running at any gien time.
+# variable.
#
# Required environment variables:
-# TF_GPU_COUNT = Number of GPUs available. This HAS TO BE IN SYNC with the
-# value of --local_test_jobs flag for bazel.
-
-BASH_VER_MAJOR=$(echo ${BASH_VERSION} | cut -d '.' -f 1)
-BASH_VER_MINOR=$(echo ${BASH_VERSION} | cut -d '.' -f 2)
-
-if [[ ${BASH_VER_MAJOR} -lt 4 ]]; then
- echo "Insufficient bash version: ${BASH_VERSION} < 4.2" >&2
- exit 1
-elif [[ ${BASH_VER_MAJOR} -eq 4 ]] && [[ ${BASH_VER_MINOR} -lt 2 ]]; then
- echo "Insufficient bash version: ${BASH_VERSION} < 4.2" >&2
- exit 1
-fi
-
-function is_absolute {
- [[ "$1" = /* ]] || [[ "$1" =~ ^[a-zA-Z]:[/\\].* ]]
-}
-
-RUNFILES_MANIFEST_FILE="${TEST_SRCDIR}/MANIFEST"
-function rlocation() {
- if is_absolute "$1" ; then
- # If the file path is already fully specified, simply return it.
- echo "$1"
- elif [[ -e "$TEST_SRCDIR/$1" ]]; then
- # If the file exists in the $TEST_SRCDIR then just use it.
- echo "$TEST_SRCDIR/$1"
- elif [[ -e "$RUNFILES_MANIFEST_FILE" ]]; then
- # If a runfiles manifest file exists then use it.
- echo "$(grep "^$1 " "$RUNFILES_MANIFEST_FILE" | sed 's/[^ ]* //')"
- fi
-}
-
-TEST_BINARY="$(rlocation $TEST_WORKSPACE/${1#./})"
-shift
-
-# Make sure /var/lock exists, this may not be true under MSYS
-mkdir -p /var/lock
+# TF_GPU_COUNT = Number of GPUs available.
TF_GPU_COUNT=${TF_GPU_COUNT:-8}
+TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU:-4}
+# We want to allow running one of the following configs:
+# - 4 tests per GPU on k80
+# - 8 tests per GPU on p100
+# p100 has minimum 12G memory. Therefore, we should limit each test to 1.5G.
+# To leave some room in case we want to run more tests in parallel in the
+# future and to use a rounder number, we set it to 1G.
+export TF_PER_DEVICE_MEMORY_LIMIT_MB=1024
-for i in `seq 0 $((TF_GPU_COUNT-1))`; do
- exec {lock_fd}>/var/lock/gpulock$i || exit 1
- if flock -n "$lock_fd";
- then
- (
- # This export only works within the brackets, so it is isolated to one
- # single command.
- export CUDA_VISIBLE_DEVICES=$i
- echo "Running test $TEST_BINARY $* on GPU $CUDA_VISIBLE_DEVICES"
- "$TEST_BINARY" $@
- )
- return_code=$?
- flock -u "$lock_fd"
- exit $return_code
- fi
+mkdir -p /var/lock
+# Try to acquire any of the TF_GPU_COUNT * TF_TESTS_PER_GPU
+# slots in which to run a test.
+#
+# Prefer to allocate 1 test per GPU over 4 tests on 1 GPU.
+# So, we iterate over TF_TESTS_PER_GPU first.
+for j in `seq 0 $((TF_TESTS_PER_GPU-1))`; do
+ for i in `seq 0 $((TF_GPU_COUNT-1))`; do
+ echo "Trying to lock GPU $i for index $j"
+ exec {lock_fd}>/var/lock/gpulock${i}_${j} || exit 1
+ if flock -n "$lock_fd";
+ then
+ (
+ # This export only works inside the parentheses (a subshell), so it is
+ # isolated to one single command.
+ export CUDA_VISIBLE_DEVICES=$i
+ echo "Running test $@ on GPU $CUDA_VISIBLE_DEVICES"
+ $@
+ )
+ return_code=$?
+ flock -u "$lock_fd"
+ exit $return_code
+ fi
+ done
done
echo "Cannot find a free GPU to run the test $* on, exiting with failure..."
exit 1
-