author    Vijay Vasudevan <vrv@google.com>  2016-08-30 19:04:28 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>  2016-08-30 20:17:35 -0700
commit 3e8c4fd7403659ec32b9fec90a78831043aa0786 (patch)
tree 41cb60a4783af9eb341b36b5a1a0850556774d04
parent 9d6467825c9f3adf8df40d0281fa97280b372205 (diff)
Don't establish contexts on GPUs not in visible_device_list.
Moves all the initialization code out of gpu_init.h and into gpu_device.cc, because we want the code that establishes peer mappings between GPUs to reside close to where the device selection order is made. Now the initialization code does nothing but call the StreamExecutor platform initialization.

This also checks that there are no duplicate entries in the visible_device_list.

I tested this by running the following program:

import time
import tensorflow as tf

c = tf.ConfigProto()
c.gpu_options.visible_device_list = "1"
s = tf.Session(config=c)
time.sleep(5)
# nvidia-smi showed the context was established on device 1 but NOT 0
del s

c.gpu_options.visible_device_list = "1,0"
s = tf.Session(config=c)
time.sleep(30)
# nvidia-smi showed the context was established on both device 0 and 1,
# and the logs showed that the device ordering was 1->/gpu:0 and 0->/gpu:1,
# as well as the fact that it tried to establish the peer mapping.
del s

c.gpu_options.visible_device_list = "1,0,1"
s = tf.Session(config=c)  # failed

Fixes #1888
Change: 131785661
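As a minimal usage sketch of the behavior described above (the specific Python error type, tf.errors.InvalidArgumentError, and its surfacing at Session construction are assumptions, not verified against this change):

import tensorflow as tf

config = tf.ConfigProto()
# With "1,0", physical GPU 1 is mapped to /gpu:0 and physical GPU 0 to /gpu:1.
config.gpu_options.visible_device_list = "1,0"
sess = tf.Session(config=config)
sess.close()

# A duplicate physical id in the list is rejected when the session is created.
config.gpu_options.visible_device_list = "1,0,1"
try:
    tf.Session(config=config)
except tf.errors.InvalidArgumentError as err:  # assumed error type
    print("duplicate entry rejected:", err)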
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.cc   | 166
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.h     |   4
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_init.cc      | 134
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_init.h       |   9
-rw-r--r--  tensorflow/core/util/stream_executor_util.h         |  12
5 files changed, 177 insertions(+), 148 deletions(-)
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 033d67772c..d644dfa037 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -54,6 +54,7 @@ limitations under the License.
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session_options.h"
#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/stream_executor_util.h"
namespace gpu = ::perftools::gputools;
@@ -710,13 +711,14 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
return Status::OK();
}
-static int GetDefaultMinGPUMultiprocessorCount(gpu::Platform* gpu_manager) {
+static int GetDefaultMinGPUMultiprocessorCount(
+ gpu::Platform* gpu_manager, const std::vector<int>& visible_gpu_order) {
static const int kDefaultMinGPUMultiprocessorCount = 8;
// Find the highest multi-processor count across all visible GPUs.
int max_count = -1;
- for (int i = 0; i < gpu_manager->VisibleDeviceCount(); ++i) {
- auto exec_status = gpu_manager->ExecutorForDevice(i);
+ for (int i = 0; i < visible_gpu_order.size(); ++i) {
+ auto exec_status = gpu_manager->ExecutorForDevice(visible_gpu_order[i]);
if (!exec_status.ok()) {
continue;
}
@@ -733,12 +735,13 @@ static int GetDefaultMinGPUMultiprocessorCount(gpu::Platform* gpu_manager) {
}
}
-static int GetMinGPUMultiprocessorCount(gpu::Platform* gpu_manager) {
+static int GetMinGPUMultiprocessorCount(
+ gpu::Platform* gpu_manager, const std::vector<int>& visible_gpu_order) {
const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
if (tf_min_gpu_core_count == nullptr ||
strcmp(tf_min_gpu_core_count, "") == 0) {
- return GetDefaultMinGPUMultiprocessorCount(gpu_manager);
+ return GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
}
int min_gpu_core_count = -1;
@@ -748,7 +751,8 @@ static int GetMinGPUMultiprocessorCount(gpu::Platform* gpu_manager) {
}
}
- int count = GetDefaultMinGPUMultiprocessorCount(gpu_manager);
+ int count =
+ GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
LOG(ERROR) << "Invalid minimum GPU multiprocessor count: ["
<< tf_min_gpu_core_count << "]. "
<< "Using the default value: " << count;
@@ -810,10 +814,58 @@ std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
return cuda_caps;
}
+std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(
+ gpu::Platform* platform, const std::vector<int>& visible_gpu_order) {
+ std::unique_ptr<std::map<std::pair<int, int>, bool>> map(
+ new std::map<std::pair<int, int>, bool>);
+ for (int i = 0; i < visible_gpu_order.size(); ++i) {
+ const int i_gpu_id = visible_gpu_order[i];
+ for (int j = 0; j < visible_gpu_order.size(); ++j) {
+ const int j_gpu_id = visible_gpu_order[j];
+ gpu::StreamExecutor* from =
+ platform->ExecutorForDevice(i_gpu_id).ValueOrDie();
+ gpu::StreamExecutor* to =
+ platform->ExecutorForDevice(j_gpu_id).ValueOrDie();
+ (*map)[{i, j}] = from->CanEnablePeerAccessTo(to);
+ }
+ }
+
+ return map;
+}
+
+Status EnablePeerAccess(gpu::Platform* platform,
+ const std::vector<int>& visible_gpu_order) {
+ for (int i = 0; i < visible_gpu_order.size(); ++i) {
+ const int i_gpu_id = visible_gpu_order[i];
+ for (int j = 0; j < visible_gpu_order.size(); ++j) {
+ const int j_gpu_id = visible_gpu_order[j];
+ // We have already validated that ExecutorForDevice() calls
+ // return OK.
+ gpu::StreamExecutor* from =
+ platform->ExecutorForDevice(i_gpu_id).ValueOrDie();
+ gpu::StreamExecutor* to =
+ platform->ExecutorForDevice(j_gpu_id).ValueOrDie();
+
+ if (from->CanEnablePeerAccessTo(to)) {
+ auto status = from->EnablePeerAccessTo(to);
+ if (!status.ok()) {
+ return errors::Internal(status.ToString());
+ }
+ } else {
+ LOG(INFO) << "cannot enable peer access from device ordinal "
+ << i_gpu_id << " to device ordinal " << j_gpu_id;
+ }
+ }
+ }
+ return Status::OK();
+}
+
} // namespace
Status BaseGPUDeviceFactory::GetValidDeviceIds(
const string& visible_device_list, std::vector<int>* ids) {
+ TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
+
gpu::Platform* gpu_manager = GPUMachineManager();
if (gpu_manager == nullptr) {
return Status::OK();
@@ -824,16 +876,6 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
return Status::OK();
}
- auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
- if (cuda_supported_capabilities.empty()) {
- return errors::FailedPrecondition(
- "No supported cuda capabilities in binary.");
- }
- CudaVersion min_supported_capability = *std::min_element(
- cuda_supported_capabilities.begin(), cuda_supported_capabilities.end());
-
- int min_gpu_core_count = GetMinGPUMultiprocessorCount(gpu_manager);
-
// If the user wants to remap the visible to virtual GPU mapping,
// check for that here.
std::vector<int> visible_gpu_order;
@@ -863,6 +905,96 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
}
}
+ // Validate no repeats.
+ std::set<int> visible_device_set(visible_gpu_order.begin(),
+ visible_gpu_order.end());
+ if (visible_device_set.size() != visible_gpu_order.size()) {
+ return errors::InvalidArgument(
+ "visible_device_list contained "
+ "a duplicate entry: ",
+ visible_device_list);
+ }
+
+ bool new_gpu_found = false;
+ for (int i = 0; i < visible_gpu_order.size(); ++i) {
+ int gpu_id = visible_gpu_order[i];
+
+ // Only perform this once per visible gpu id.
+ if (visible_gpu_initialized_[gpu_id]) {
+ continue;
+ }
+
+ visible_gpu_initialized_[gpu_id] = true;
+ new_gpu_found = true;
+
+ auto executor = gpu_manager->ExecutorForDevice(gpu_id);
+ if (!executor.ok()) {
+ return StreamExecutorUtil::ConvertStatus(executor.status());
+ }
+
+ auto stream_exec = executor.ValueOrDie();
+ int64 free_bytes;
+ int64 total_bytes;
+ if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
+ // Logs internally on failure.
+ free_bytes = 0;
+ total_bytes = 0;
+ }
+ const auto& description = stream_exec->GetDeviceDescription();
+ int cc_major;
+ int cc_minor;
+ if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
+ // Logs internally on failure.
+ cc_major = 0;
+ cc_minor = 0;
+ }
+ LOG(INFO) << "Found device " << i << " with properties: "
+ << "\nname: " << description.name() << "\nmajor: " << cc_major
+ << " minor: " << cc_minor << " memoryClockRate (GHz) "
+ << description.clock_rate_ghz() << "\npciBusID "
+ << description.pci_bus_id() << "\nTotal memory: "
+ << strings::HumanReadableNumBytes(total_bytes)
+ << "\nFree memory: "
+ << strings::HumanReadableNumBytes(free_bytes);
+ }
+
+ if (new_gpu_found) {
+ // Enable peer access
+ TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));
+
+ // Print out a matrix showing which devices can DMA to one
+ // another.
+ auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
+ string line_buf = "DMA: ";
+ for (int i = 0; i < visible_gpu_order.size(); ++i) {
+ strings::StrAppend(&line_buf, visible_gpu_order[i], " ");
+ }
+ LOG(INFO) << line_buf;
+ for (int i = 0; i < visible_gpu_order.size(); ++i) {
+ line_buf = strings::StrCat(visible_gpu_order[i], ": ");
+ for (int j = 0; j < visible_gpu_order.size(); ++j) {
+ if ((*access_map)[{i, j}]) {
+ line_buf.append("Y ");
+ } else {
+ line_buf.append("N ");
+ }
+ }
+ LOG(INFO) << line_buf;
+ }
+ }
+
+ auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
+ if (cuda_supported_capabilities.empty()) {
+ return errors::FailedPrecondition(
+ "No supported cuda capabilities in binary.");
+ }
+ CudaVersion min_supported_capability = *std::min_element(
+ cuda_supported_capabilities.begin(), cuda_supported_capabilities.end());
+
+ int min_gpu_core_count =
+ GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
+
+ // Filter out devices that don't have the right capability or power.
for (int i = 0; i < visible_gpu_order.size(); ++i) {
const int32 visible_gpu_id = visible_gpu_order[i];
auto exec_status = gpu_manager->ExecutorForDevice(visible_gpu_id);
@@ -879,7 +1011,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
// Only GPUs with no less than the minimum supported compute capability is
// accepted.
if (device_capability < min_supported_capability) {
- LOG(INFO) << "Ignoring physical gpu device "
+ LOG(INFO) << "Ignoring visible gpu device "
<< "(" << GetShortDeviceDescription(visible_gpu_id, desc)
<< ") "
<< "with Cuda compute capability " << device_capability
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index abe7c0f687..7bd909e9ce 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -129,6 +129,10 @@ class BaseGPUDeviceFactory : public DeviceFactory {
// gpu ids'.
Status GetValidDeviceIds(const string& visible_device_list,
std::vector<int>* ids);
+
+ // visible_gpu_initialized_[gpu_id] is true if visible GPU gpu_id
+ // has been initialized by the process.
+ std::unordered_map<int, bool> visible_gpu_initialized_;
};
} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.cc b/tensorflow/core/common_runtime/gpu/gpu_init.cc
index b0535c7927..aa23e3cc61 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.cc
@@ -19,153 +19,27 @@ limitations under the License.
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/stream_executor_util.h"
namespace gpu = ::perftools::gputools;
namespace tensorflow {
-namespace {
-
-std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(
- gpu::Platform* platform, int device_count) {
- auto* map = new std::map<std::pair<int, int>, bool>;
- for (int i = 0; i < device_count; ++i) {
- for (int j = 0; j < device_count; ++j) {
- gpu::StreamExecutor* from = platform->ExecutorForDevice(i).ValueOrDie();
- gpu::StreamExecutor* to = platform->ExecutorForDevice(j).ValueOrDie();
- (*map)[{i, j}] = from->CanEnablePeerAccessTo(to);
- }
- }
-
- return std::unique_ptr<std::map<std::pair<int, int>, bool>>{map};
-}
-
-Status EnablePeerAccess(gpu::Platform* platform, int device_count) {
- for (int i = 0; i < device_count; ++i) {
- for (int j = 0; j < device_count; ++j) {
- // We have already validated that ExecutorForDevice() calls
- // return OK.
- gpu::StreamExecutor* from = platform->ExecutorForDevice(i).ValueOrDie();
- gpu::StreamExecutor* to = platform->ExecutorForDevice(j).ValueOrDie();
-
- if (from->CanEnablePeerAccessTo(to)) {
- auto status = from->EnablePeerAccessTo(to);
- if (!status.ok()) {
- return errors::Internal(status.ToString());
- }
- } else {
- LOG(INFO) << "cannot enable peer access from device ordinal " << i
- << " to device ordinal " << j;
- }
- }
- }
- return Status::OK();
-}
-
-namespace {
-
-// TODO(vrv): Move this out into a common header so it can be used
-// more widely.
-Status ConvertStatus(const perftools::gputools::port::Status& s) {
- return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>(
- static_cast<int>(s.code())),
- s.error_message());
-}
-
-} // namespace
-
-static Status InitGPU() {
+Status ValidateGPUMachineManager() {
auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
if (!result.ok()) {
- return ConvertStatus(result.status());
- }
-
- gpu::Platform* platform = result.ValueOrDie();
-
- int dev_count = platform->VisibleDeviceCount();
-
- if (dev_count <= 0) {
- LOG(INFO) << "No GPU devices available on machine.";
- return Status::OK();
- }
-
- for (int i = 0; i < dev_count; ++i) {
- auto executor = platform->ExecutorForDevice(i);
- if (!executor.ok()) {
- return ConvertStatus(executor.status());
- }
-
- auto stream_exec = executor.ValueOrDie();
- int64 free_bytes;
- int64 total_bytes;
- if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
- // Logs internally on failure.
- free_bytes = 0;
- total_bytes = 0;
- }
- const auto& description = stream_exec->GetDeviceDescription();
- int cc_major;
- int cc_minor;
- if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
- // Logs internally on failure.
- cc_major = 0;
- cc_minor = 0;
- }
- LOG(INFO) << "Found device " << i << " with properties: "
- << "\nname: " << description.name() << "\nmajor: " << cc_major
- << " minor: " << cc_minor << " memoryClockRate (GHz) "
- << description.clock_rate_ghz() << "\npciBusID "
- << description.pci_bus_id() << "\nTotal memory: "
- << strings::HumanReadableNumBytes(total_bytes)
- << "\nFree memory: "
- << strings::HumanReadableNumBytes(free_bytes);
- }
-
- // Enable peer access
- TF_RETURN_IF_ERROR(EnablePeerAccess(platform, dev_count));
-
- // Print out a matrix showing which devices can DMA to one
- // another.
- auto access_map = GetPeerAccessMap(platform, dev_count);
- string line_buf = "DMA: ";
- for (int i = 0; i < dev_count; ++i) {
- strings::StrAppend(&line_buf, i, " ");
- }
- LOG(INFO) << line_buf;
- for (int i = 0; i < dev_count; ++i) {
- line_buf = strings::StrCat(i, ": ");
- for (int j = 0; j < dev_count; ++j) {
- if ((*access_map)[{i, j}]) {
- line_buf.append("Y ");
- } else {
- line_buf.append("N ");
- }
- }
- LOG(INFO) << line_buf;
+ return StreamExecutorUtil::ConvertStatus(result.status());
}
return Status::OK();
}
-static Status InitModule() { return InitGPU(); }
-
-} // namespace
-
gpu::Platform* GPUMachineManager() {
- // Create the machine manager singleton and initialize the GPUs only
- // once.
- static Status init = InitModule();
- if (!init.ok()) {
- LOG(WARNING)
- << "Not initializing the GPU, could not create GPU MachineManager. "
- << "Error: " << init;
- return nullptr;
- }
-
auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
if (!result.ok()) {
LOG(FATAL) << "Could not find Platform with name CUDA";
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.h b/tensorflow/core/common_runtime/gpu/gpu_init.h
index fdab603eb8..927d05d5ba 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.h
@@ -16,6 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_
#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_
+#include "tensorflow/core/lib/core/status.h"
+
namespace perftools {
namespace gputools {
class Platform;
@@ -24,9 +26,14 @@ class Platform;
namespace tensorflow {
+// Initializes the CUDA platform and returns OK if the CUDA
+// platform could be initialized.
+Status ValidateGPUMachineManager();
+
// Returns the GPU machine manager singleton, creating it and
// initializing the GPUs on the machine if needed the first time it is
-// called.
+// called. Must only be called when there is a valid GPU environment
+// in the process (e.g., ValidateGPUMachineManager() returns OK).
perftools::gputools::Platform* GPUMachineManager();
} // namespace tensorflow
diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h
index f8debfe416..6a5ddec04c 100644
--- a/tensorflow/core/util/stream_executor_util.h
+++ b/tensorflow/core/util/stream_executor_util.h
@@ -17,6 +17,8 @@ limitations under the License.
#define TENSORFLOW_CORE_UTIL_STREAM_EXECUTOR_UTIL_H_
#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/stream_executor.h"
namespace tensorflow {
@@ -33,6 +35,16 @@ class StreamExecutorUtil {
return perftools::gputools::DeviceMemory<T>(
perftools::gputools::DeviceMemoryBase(ptr, t.TotalBytes()));
}
+
+ // Converts from a StreamExecutor Status to a TensorFlow Status.
+ //
+ // This assumes that the error codes between the two implementations
+ // match.
+ static Status ConvertStatus(const perftools::gputools::port::Status& s) {
+ return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>(
+ static_cast<int>(s.code())),
+ s.error_message());
+ }
};
} // namespace tensorflow