4 files changed, 67 insertions, 59 deletions
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 30c24fe24c..b1b935f1a5 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -877,7 +877,6 @@ tf_cuda_library(
         "util/bcast.h",
         "util/cuda_kernel_helper.h",
         "util/device_name_utils.h",
-        "util/env_var.h",
         "util/events_writer.h",
         "util/example_proto_fast_parsing.h",
         "util/example_proto_helper.h",
@@ -2059,6 +2058,7 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [
     "platform/snappy.h",
     "platform/tensor_coding.h",
     "platform/tracing.h",
+    "util/env_var.h",
 ]
 
 # Replicated for lib_internal and lib_internal_impl.
@@ -2098,6 +2098,7 @@ cc_library(
             "platform/*.cc",
             "platform/profile_utils/**/*.cc",
             "framework/resource_handle.cc",
+            "util/env_var.cc",
         ],
         exclude = [
             "**/*test*",
@@ -2453,7 +2454,6 @@ FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [
     "framework/unique_tensor_references.h",
     "framework/variant.h",
     "util/command_line_flags.h",
-    "util/env_var.h",
     "util/equal_graph_def.h",
     "util/presized_cuckoo_map.h",
     "util/tensor_slice_set.h",
@@ -2529,6 +2529,7 @@ tf_cuda_library(
             "util/memmapped_file_system_writer.*",
             "util/stats_calculator.*",
             "util/version_info.cc",
+            "util/env_var.cc",
         ],
     ) + select({
         "//tensorflow:windows": [],
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 9515d8e62a..10bf006787 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <atomic>
 #include <utility>
 
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/fft.h"
 #include "tensorflow/stream_executor/lib/env.h"
@@ -163,6 +164,15 @@ StreamExecutor::StreamExecutor(PlatformKind platform_kind,
   CheckPlatformKindIsValid(platform_kind);
 }
 
+// Reads the per-device memory limit from TF_PER_DEVICE_MEMORY_LIMIT_MB and
+// converts it from megabytes to bytes. Yields 0 when the variable is unset.
+static int64 GetMemoryLimitBytes() {
+  int64 limit_mb;
+  SE_CHECK_OK(tensorflow::ReadInt64FromEnvVar("TF_PER_DEVICE_MEMORY_LIMIT_MB",
+                                              0, &limit_mb));
+  return (1ll << 20) * limit_mb;
+}
+
 StreamExecutor::StreamExecutor(
     const Platform *platform,
     std::unique_ptr<internal::StreamExecutorInterface> implementation)
@@ -172,7 +182,9 @@ StreamExecutor::StreamExecutor(
       background_threads_(new port::ThreadPool(
           port::Env::Default(), "stream_executor", kNumBackgroundThreads)),
       live_stream_count_(0),
-      tracing_enabled_(false) {
+      tracing_enabled_(false),
+      mem_alloc_bytes_(0),
+      memory_limit_bytes_(GetMemoryLimitBytes()) {
   if (port::Lowercase(platform_->Name()) == "cuda") {
     platform_kind_ = PlatformKind::kCuda;
   } else if (port::Lowercase(platform_->Name()) == "opencl") {
@@ -460,6 +472,14 @@ port::Status StreamExecutor::BlockHostUntilDone(Stream *stream) {
 }
 
 void *StreamExecutor::Allocate(uint64 size) {
+  if (memory_limit_bytes_ > 0 &&
+      mem_alloc_bytes_ + size > memory_limit_bytes_) {
+    LOG(WARNING) << "Not enough memory to allocate " << size << " on device "
+                 << device_ordinal_
+                 << " within provided limit. [used=" << mem_alloc_bytes_
+                 << ", limit=" << memory_limit_bytes_ << "]";
+    return nullptr;
+  }
   void *buf = implementation_->Allocate(size);
   VLOG(1) << "Called StreamExecutor::Allocate(size=" << size << ") returns "
           << buf << StackTraceIfVLOG10();
@@ -779,6 +799,7 @@ void StreamExecutor::CreateAllocRecord(void *opaque, uint64 bytes) {
     mutex_lock lock(mu_);
     mem_allocs_[opaque] = AllocRecord{
         bytes, ""};
+    mem_alloc_bytes_ += bytes;
   }
 }
 
@@ -789,6 +810,7 @@ void StreamExecutor::EraseAllocRecord(void *opaque) {
       LOG(ERROR) << "Deallocating unknown pointer: "
                  << port::Printf("0x%p", opaque);
     } else {
+      mem_alloc_bytes_ -= mem_allocs_[opaque].bytes;
       mem_allocs_.erase(opaque);
     }
   }
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 437f298616..d04025b681 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -699,6 +699,13 @@ class StreamExecutor {
   // The set of TraceListeners registered for this StreamExecutor.
   std::set<TraceListener*> listeners_ GUARDED_BY(mu_);
 
+  // Total size in bytes of the allocations currently recorded in mem_allocs_.
+  int64 mem_alloc_bytes_;
+
+  // Memory limit in bytes. A value less than or equal to 0 indicates there is
+  // no limit.
+  int64 memory_limit_bytes_;
+
   SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutor);
 };
 
diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
index 75da9bb835..cc99f8023a 100755
--- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
+++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
@@ -16,68 +16,46 @@
 #
 #
 # A script to run multiple GPU tests in parallel controlled with an environment
-# variable. This script will assume that when it runs, one of the locks are
-# already released. So the program calling this script is expected to make sure
-# that only $TF_GPU_COUNT processes are running at any gien time.
+# variable.
 #
 # Required environment variables:
-#     TF_GPU_COUNT = Number of GPUs available. This HAS TO BE IN SYNC with the
-#                    value of --local_test_jobs flag for bazel.
-
-BASH_VER_MAJOR=$(echo ${BASH_VERSION} | cut -d '.' -f 1)
-BASH_VER_MINOR=$(echo ${BASH_VERSION} | cut -d '.' -f 2)
-
-if [[ ${BASH_VER_MAJOR} -lt 4 ]]; then
-  echo "Insufficient bash version: ${BASH_VERSION} < 4.2" >&2
-  exit 1
-elif [[ ${BASH_VER_MAJOR} -eq 4 ]] && [[ ${BASH_VER_MINOR} -lt 2 ]]; then
-  echo "Insufficient bash version: ${BASH_VERSION} < 4.2" >&2
-  exit 1
-fi
-
-function is_absolute {
-  [[ "$1" = /* ]] || [[ "$1" =~ ^[a-zA-Z]:[/\\].* ]]
-}
-
-RUNFILES_MANIFEST_FILE="${TEST_SRCDIR}/MANIFEST"
-function rlocation() {
-  if is_absolute "$1" ; then
-    # If the file path is already fully specified, simply return it.
-    echo "$1"
-  elif [[ -e "$TEST_SRCDIR/$1" ]]; then
-    # If the file exists in the $TEST_SRCDIR then just use it.
-    echo "$TEST_SRCDIR/$1"
-  elif [[ -e "$RUNFILES_MANIFEST_FILE" ]]; then
-    # If a runfiles manifest file exists then use it.
-    echo "$(grep "^$1 " "$RUNFILES_MANIFEST_FILE" | sed 's/[^ ]* //')"
-  fi
-}
-
-TEST_BINARY="$(rlocation $TEST_WORKSPACE/${1#./})"
-shift
-
-# Make sure /var/lock exists, this may not be true under MSYS
-mkdir -p /var/lock
+#     TF_GPU_COUNT = Number of GPUs available.
 
 TF_GPU_COUNT=${TF_GPU_COUNT:-8}
+TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU:-4}
+# We want to allow running one of the following configs:
+#  - 4 tests per GPU on K80
+#  - 8 tests per GPU on P100
+# A P100 has at least 12G of memory, so 8 tests allow at most 1.5G per test.
+# To leave some room in case we want to run more tests in parallel in the
+# future, and to use a rounder number, we set the per-test limit to 1G.
+export TF_PER_DEVICE_MEMORY_LIMIT_MB=1024
 
-for i in `seq 0 $((TF_GPU_COUNT-1))`; do
-  exec {lock_fd}>/var/lock/gpulock$i || exit 1
-  if flock -n "$lock_fd";
-  then
-    (
-      # This export only works within the brackets, so it is isolated to one
-      # single command.
-      export CUDA_VISIBLE_DEVICES=$i
-      echo "Running test $TEST_BINARY $* on GPU $CUDA_VISIBLE_DEVICES"
-      "$TEST_BINARY" $@
-    )
-    return_code=$?
-    flock -u "$lock_fd"
-    exit $return_code
-  fi
+mkdir -p /var/lock
+# Try to acquire any of the TF_GPU_COUNT * TF_TESTS_PER_GPU slots (one lock
+# file per slot) and run the test command, passed as "$@", in that slot.
+#
+# Prefer to allocate 1 test per GPU over several tests on 1 GPU, so the outer
+# loop iterates over the per-GPU slot index and the inner loop over the GPUs.
+for j in `seq 0 $((TF_TESTS_PER_GPU-1))`; do
+  for i in `seq 0 $((TF_GPU_COUNT-1))`; do
+    echo "Trying to lock GPU $i for index $j"
+    exec {lock_fd}>/var/lock/gpulock${i}_${j} || exit 1
+    if flock -n "$lock_fd";
+    then
+      (
+        # The subshell keeps this export local to the test command. "$@" is
+        # quoted so arguments with spaces or globs reach the test unchanged.
+        export CUDA_VISIBLE_DEVICES=$i
+        echo "Running test $* on GPU $CUDA_VISIBLE_DEVICES"
+        "$@"
+      )
+      return_code=$?
+      flock -u "$lock_fd"
+      exit $return_code
+    fi
+  done
 done
 
 echo "Cannot find a free GPU to run the test $* on, exiting with failure..."
 exit 1
-