-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.cc   166
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.h      4
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_init.cc     134
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_init.h        9
-rw-r--r--  tensorflow/core/util/stream_executor_util.h         12
5 files changed, 177 insertions, 148 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 033d67772c..d644dfa037 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -54,6 +54,7 @@ limitations under the License.
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session_options.h"
#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/stream_executor_util.h"
namespace gpu = ::perftools::gputools;
@@ -710,13 +711,14 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
return Status::OK();
}
-static int GetDefaultMinGPUMultiprocessorCount(gpu::Platform* gpu_manager) {
+static int GetDefaultMinGPUMultiprocessorCount(
+ gpu::Platform* gpu_manager, const std::vector<int>& visible_gpu_order) {
static const int kDefaultMinGPUMultiprocessorCount = 8;
// Find the highest multi-processor count across all visible GPUs.
int max_count = -1;
- for (int i = 0; i < gpu_manager->VisibleDeviceCount(); ++i) {
- auto exec_status = gpu_manager->ExecutorForDevice(i);
+ for (int i = 0; i < visible_gpu_order.size(); ++i) {
+ auto exec_status = gpu_manager->ExecutorForDevice(visible_gpu_order[i]);
if (!exec_status.ok()) {
continue;
}
@@ -733,12 +735,13 @@ static int GetDefaultMinGPUMultiprocessorCount(gpu::Platform* gpu_manager) {
}
}
-static int GetMinGPUMultiprocessorCount(gpu::Platform* gpu_manager) {
+static int GetMinGPUMultiprocessorCount(
+ gpu::Platform* gpu_manager, const std::vector<int>& visible_gpu_order) {
const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
if (tf_min_gpu_core_count == nullptr ||
strcmp(tf_min_gpu_core_count, "") == 0) {
- return GetDefaultMinGPUMultiprocessorCount(gpu_manager);
+ return GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
}
int min_gpu_core_count = -1;
@@ -748,7 +751,8 @@ static int GetMinGPUMultiprocessorCount(gpu::Platform* gpu_manager) {
}
}
- int count = GetDefaultMinGPUMultiprocessorCount(gpu_manager);
+ int count =
+ GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
LOG(ERROR) << "Invalid minimum GPU multiprocessor count: ["
<< tf_min_gpu_core_count << "]. "
<< "Using the default value: " << count;
@@ -810,10 +814,58 @@ std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
return cuda_caps;
}
+std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(
+ gpu::Platform* platform, const std::vector<int>& visible_gpu_order) {
+ std::unique_ptr<std::map<std::pair<int, int>, bool>> map(
+ new std::map<std::pair<int, int>, bool>);
+ for (int i = 0; i < visible_gpu_order.size(); ++i) {
+ const int i_gpu_id = visible_gpu_order[i];
+ for (int j = 0; j < visible_gpu_order.size(); ++j) {
+ const int j_gpu_id = visible_gpu_order[j];
+ gpu::StreamExecutor* from =
+ platform->ExecutorForDevice(i_gpu_id).ValueOrDie();
+ gpu::StreamExecutor* to =
+ platform->ExecutorForDevice(j_gpu_id).ValueOrDie();
+ (*map)[{i, j}] = from->CanEnablePeerAccessTo(to);
+ }
+ }
+
+ return map;
+}
+
+Status EnablePeerAccess(gpu::Platform* platform,
+ const std::vector<int>& visible_gpu_order) {
+ for (int i = 0; i < visible_gpu_order.size(); ++i) {
+ const int i_gpu_id = visible_gpu_order[i];
+ for (int j = 0; j < visible_gpu_order.size(); ++j) {
+ const int j_gpu_id = visible_gpu_order[j];
+ // We have already validated that ExecutorForDevice() calls
+ // return OK.
+ gpu::StreamExecutor* from =
+ platform->ExecutorForDevice(i_gpu_id).ValueOrDie();
+ gpu::StreamExecutor* to =
+ platform->ExecutorForDevice(j_gpu_id).ValueOrDie();
+
+ if (from->CanEnablePeerAccessTo(to)) {
+ auto status = from->EnablePeerAccessTo(to);
+ if (!status.ok()) {
+ return errors::Internal(status.ToString());
+ }
+ } else {
+ LOG(INFO) << "cannot enable peer access from device ordinal "
+ << i_gpu_id << " to device ordinal " << j_gpu_id;
+ }
+ }
+ }
+ return Status::OK();
+}
+
} // namespace
Status BaseGPUDeviceFactory::GetValidDeviceIds(
const string& visible_device_list, std::vector<int>* ids) {
+ TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
+
gpu::Platform* gpu_manager = GPUMachineManager();
if (gpu_manager == nullptr) {
return Status::OK();
@@ -824,16 +876,6 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
return Status::OK();
}
- auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
- if (cuda_supported_capabilities.empty()) {
- return errors::FailedPrecondition(
- "No supported cuda capabilities in binary.");
- }
- CudaVersion min_supported_capability = *std::min_element(
- cuda_supported_capabilities.begin(), cuda_supported_capabilities.end());
-
- int min_gpu_core_count = GetMinGPUMultiprocessorCount(gpu_manager);
-
// If the user wants to remap the visible to virtual GPU mapping,
// check for that here.
std::vector<int> visible_gpu_order;
@@ -863,6 +905,96 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
}
}
+ // Validate no repeats.
+ std::set<int> visible_device_set(visible_gpu_order.begin(),
+ visible_gpu_order.end());
+ if (visible_device_set.size() != visible_gpu_order.size()) {
+ return errors::InvalidArgument(
+ "visible_device_list contained "
+ "a duplicate entry: ",
+ visible_device_list);
+ }
+
+ bool new_gpu_found = false;
+ for (int i = 0; i < visible_gpu_order.size(); ++i) {
+ int gpu_id = visible_gpu_order[i];
+
+ // Only perform this once per visible gpu id.
+ if (visible_gpu_initialized_[gpu_id]) {
+ continue;
+ }
+
+ visible_gpu_initialized_[gpu_id] = true;
+ new_gpu_found = true;
+
+ auto executor = gpu_manager->ExecutorForDevice(gpu_id);
+ if (!executor.ok()) {
+ return StreamExecutorUtil::ConvertStatus(executor.status());
+ }
+
+ auto stream_exec = executor.ValueOrDie();
+ int64 free_bytes;
+ int64 total_bytes;
+ if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
+ // Logs internally on failure.
+ free_bytes = 0;
+ total_bytes = 0;
+ }
+ const auto& description = stream_exec->GetDeviceDescription();
+ int cc_major;
+ int cc_minor;
+ if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
+ // Logs internally on failure.
+ cc_major = 0;
+ cc_minor = 0;
+ }
+ LOG(INFO) << "Found device " << i << " with properties: "
+ << "\nname: " << description.name() << "\nmajor: " << cc_major
+ << " minor: " << cc_minor << " memoryClockRate (GHz) "
+ << description.clock_rate_ghz() << "\npciBusID "
+ << description.pci_bus_id() << "\nTotal memory: "
+ << strings::HumanReadableNumBytes(total_bytes)
+ << "\nFree memory: "
+ << strings::HumanReadableNumBytes(free_bytes);
+ }
+
+ if (new_gpu_found) {
+ // Enable peer access
+ TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));
+
+ // Print out a matrix showing which devices can DMA to one
+ // another.
+ auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
+ string line_buf = "DMA: ";
+ for (int i = 0; i < visible_gpu_order.size(); ++i) {
+ strings::StrAppend(&line_buf, visible_gpu_order[i], " ");
+ }
+ LOG(INFO) << line_buf;
+ for (int i = 0; i < visible_gpu_order.size(); ++i) {
+ line_buf = strings::StrCat(visible_gpu_order[i], ": ");
+ for (int j = 0; j < visible_gpu_order.size(); ++j) {
+ if ((*access_map)[{i, j}]) {
+ line_buf.append("Y ");
+ } else {
+ line_buf.append("N ");
+ }
+ }
+ LOG(INFO) << line_buf;
+ }
+ }
+
+ auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
+ if (cuda_supported_capabilities.empty()) {
+ return errors::FailedPrecondition(
+ "No supported cuda capabilities in binary.");
+ }
+ CudaVersion min_supported_capability = *std::min_element(
+ cuda_supported_capabilities.begin(), cuda_supported_capabilities.end());
+
+ int min_gpu_core_count =
+ GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
+
+ // Filter out devices that don't have the right capability or power.
for (int i = 0; i < visible_gpu_order.size(); ++i) {
const int32 visible_gpu_id = visible_gpu_order[i];
auto exec_status = gpu_manager->ExecutorForDevice(visible_gpu_id);
@@ -879,7 +1011,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
// Only GPUs with no less than the minimum supported compute capability is
// accepted.
if (device_capability < min_supported_capability) {
- LOG(INFO) << "Ignoring physical gpu device "
+ LOG(INFO) << "Ignoring visible gpu device "
<< "(" << GetShortDeviceDescription(visible_gpu_id, desc)
<< ") "
<< "with Cuda compute capability " << device_capability
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index abe7c0f687..7bd909e9ce 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -129,6 +129,10 @@ class BaseGPUDeviceFactory : public DeviceFactory {
// gpu ids'.
Status GetValidDeviceIds(const string& visible_device_list,
std::vector<int>* ids);
+
+ // visible_gpu_initialized_[gpu_id] is true if visible GPU gpu_id
+ // has been initialized by the process.
+ std::unordered_map<int, bool> visible_gpu_initialized_;
};
} // namespace tensorflow
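
The new visible_gpu_initialized_ member lets GetValidDeviceIds() run the per-device work (property logging, peer-access setup) only the first time a given visible GPU id is seen, so creating devices repeatedly in the same process does not repeat it. A minimal standalone sketch of that once-per-id guard follows; Factory and InitializeOnce() are hypothetical stand-ins, not TensorFlow APIs.

#include <cstdio>
#include <unordered_map>
#include <vector>

// Hypothetical per-device setup; stands in for querying the executor,
// logging device properties, and enabling peer access.
void InitializeOnce(int gpu_id) {
  std::printf("initializing GPU %d\n", gpu_id);
}

struct Factory {
  // Mirrors visible_gpu_initialized_: operator[] default-constructs the
  // value to false the first time a gpu_id is looked up.
  std::unordered_map<int, bool> visible_gpu_initialized_;

  void GetValidDeviceIds(const std::vector<int>& visible_gpu_order) {
    for (int gpu_id : visible_gpu_order) {
      if (visible_gpu_initialized_[gpu_id]) continue;  // already done
      visible_gpu_initialized_[gpu_id] = true;
      InitializeOnce(gpu_id);
    }
  }
};

int main() {
  Factory f;
  f.GetValidDeviceIds({0, 1});  // initializes 0 and 1
  f.GetValidDeviceIds({1, 2});  // only 2 is new; 1 is skipped
  return 0;
}
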
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.cc b/tensorflow/core/common_runtime/gpu/gpu_init.cc
index b0535c7927..aa23e3cc61 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.cc
@@ -19,153 +19,27 @@ limitations under the License.
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/stream_executor_util.h"
namespace gpu = ::perftools::gputools;
namespace tensorflow {
-namespace {
-
-std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(
- gpu::Platform* platform, int device_count) {
- auto* map = new std::map<std::pair<int, int>, bool>;
- for (int i = 0; i < device_count; ++i) {
- for (int j = 0; j < device_count; ++j) {
- gpu::StreamExecutor* from = platform->ExecutorForDevice(i).ValueOrDie();
- gpu::StreamExecutor* to = platform->ExecutorForDevice(j).ValueOrDie();
- (*map)[{i, j}] = from->CanEnablePeerAccessTo(to);
- }
- }
-
- return std::unique_ptr<std::map<std::pair<int, int>, bool>>{map};
-}
-
-Status EnablePeerAccess(gpu::Platform* platform, int device_count) {
- for (int i = 0; i < device_count; ++i) {
- for (int j = 0; j < device_count; ++j) {
- // We have already validated that ExecutorForDevice() calls
- // return OK.
- gpu::StreamExecutor* from = platform->ExecutorForDevice(i).ValueOrDie();
- gpu::StreamExecutor* to = platform->ExecutorForDevice(j).ValueOrDie();
-
- if (from->CanEnablePeerAccessTo(to)) {
- auto status = from->EnablePeerAccessTo(to);
- if (!status.ok()) {
- return errors::Internal(status.ToString());
- }
- } else {
- LOG(INFO) << "cannot enable peer access from device ordinal " << i
- << " to device ordinal " << j;
- }
- }
- }
- return Status::OK();
-}
-
-namespace {
-
-// TODO(vrv): Move this out into a common header so it can be used
-// more widely.
-Status ConvertStatus(const perftools::gputools::port::Status& s) {
- return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>(
- static_cast<int>(s.code())),
- s.error_message());
-}
-
-} // namespace
-
-static Status InitGPU() {
+Status ValidateGPUMachineManager() {
auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
if (!result.ok()) {
- return ConvertStatus(result.status());
- }
-
- gpu::Platform* platform = result.ValueOrDie();
-
- int dev_count = platform->VisibleDeviceCount();
-
- if (dev_count <= 0) {
- LOG(INFO) << "No GPU devices available on machine.";
- return Status::OK();
- }
-
- for (int i = 0; i < dev_count; ++i) {
- auto executor = platform->ExecutorForDevice(i);
- if (!executor.ok()) {
- return ConvertStatus(executor.status());
- }
-
- auto stream_exec = executor.ValueOrDie();
- int64 free_bytes;
- int64 total_bytes;
- if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
- // Logs internally on failure.
- free_bytes = 0;
- total_bytes = 0;
- }
- const auto& description = stream_exec->GetDeviceDescription();
- int cc_major;
- int cc_minor;
- if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
- // Logs internally on failure.
- cc_major = 0;
- cc_minor = 0;
- }
- LOG(INFO) << "Found device " << i << " with properties: "
- << "\nname: " << description.name() << "\nmajor: " << cc_major
- << " minor: " << cc_minor << " memoryClockRate (GHz) "
- << description.clock_rate_ghz() << "\npciBusID "
- << description.pci_bus_id() << "\nTotal memory: "
- << strings::HumanReadableNumBytes(total_bytes)
- << "\nFree memory: "
- << strings::HumanReadableNumBytes(free_bytes);
- }
-
- // Enable peer access
- TF_RETURN_IF_ERROR(EnablePeerAccess(platform, dev_count));
-
- // Print out a matrix showing which devices can DMA to one
- // another.
- auto access_map = GetPeerAccessMap(platform, dev_count);
- string line_buf = "DMA: ";
- for (int i = 0; i < dev_count; ++i) {
- strings::StrAppend(&line_buf, i, " ");
- }
- LOG(INFO) << line_buf;
- for (int i = 0; i < dev_count; ++i) {
- line_buf = strings::StrCat(i, ": ");
- for (int j = 0; j < dev_count; ++j) {
- if ((*access_map)[{i, j}]) {
- line_buf.append("Y ");
- } else {
- line_buf.append("N ");
- }
- }
- LOG(INFO) << line_buf;
+ return StreamExecutorUtil::ConvertStatus(result.status());
}
return Status::OK();
}
-static Status InitModule() { return InitGPU(); }
-
-} // namespace
-
gpu::Platform* GPUMachineManager() {
- // Create the machine manager singleton and initialize the GPUs only
- // once.
- static Status init = InitModule();
- if (!init.ok()) {
- LOG(WARNING)
- << "Not initializing the GPU, could not create GPU MachineManager. "
- << "Error: " << init;
- return nullptr;
- }
-
auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
if (!result.ok()) {
LOG(FATAL) << "Could not find Platform with name CUDA";
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.h b/tensorflow/core/common_runtime/gpu/gpu_init.h
index fdab603eb8..927d05d5ba 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.h
@@ -16,6 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_
#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_
+#include "tensorflow/core/lib/core/status.h"
+
namespace perftools {
namespace gputools {
class Platform;
@@ -24,9 +26,14 @@ class Platform;
namespace tensorflow {
+// Initializes the CUDA platform and returns OK if the CUDA
+// platform could be initialized.
+Status ValidateGPUMachineManager();
+
// Returns the GPU machine manager singleton, creating it and
// initializing the GPUs on the machine if needed the first time it is
-// called.
+// called. Must only be called when there is a valid GPU environment
+// in the process (e.g., ValidateGPUMachineManager() returns OK).
perftools::gputools::Platform* GPUMachineManager();
} // namespace tensorflow
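
The header now splits the two concerns: ValidateGPUMachineManager() reports whether the CUDA platform can be found as a returnable Status, while GPUMachineManager() assumes that check already passed and treats a missing platform as fatal. A standalone sketch of that validate-then-get contract, with Platform, ValidatePlatform(), and GetPlatform() as hypothetical stand-ins for the real StreamExecutor registry (perftools::gputools::MultiPlatformManager):

#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>

// Hypothetical platform registry.
struct Platform {
  std::string name;
};
static std::map<std::string, Platform> g_platforms = {{"CUDA", {"CUDA"}}};

struct Status {
  bool ok;
  std::string msg;
};

// Analogue of ValidateGPUMachineManager(): surface failure as a Status
// the caller can propagate (e.g., with TF_RETURN_IF_ERROR).
Status ValidatePlatform(const std::string& name) {
  if (g_platforms.count(name) == 0) {
    return {false, "could not find platform with name " + name};
  }
  return {true, ""};
}

// Analogue of GPUMachineManager(): assumes validation already succeeded,
// so a missing platform is a fatal programming error, not a soft failure.
Platform* GetPlatform(const std::string& name) {
  auto it = g_platforms.find(name);
  if (it == g_platforms.end()) {
    std::fprintf(stderr, "Could not find Platform with name %s\n", name.c_str());
    std::abort();
  }
  return &it->second;
}

int main() {
  Status s = ValidatePlatform("CUDA");
  if (!s.ok) {
    std::printf("skipping GPU devices: %s\n", s.msg.c_str());
    return 0;
  }
  Platform* p = GetPlatform("CUDA");  // safe: validated above
  std::printf("using platform %s\n", p->name.c_str());
  return 0;
}
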
diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h
index f8debfe416..6a5ddec04c 100644
--- a/tensorflow/core/util/stream_executor_util.h
+++ b/tensorflow/core/util/stream_executor_util.h
@@ -17,6 +17,8 @@ limitations under the License.
#define TENSORFLOW_CORE_UTIL_STREAM_EXECUTOR_UTIL_H_
#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/stream_executor.h"
namespace tensorflow {
@@ -33,6 +35,16 @@ class StreamExecutorUtil {
return perftools::gputools::DeviceMemory<T>(
perftools::gputools::DeviceMemoryBase(ptr, t.TotalBytes()));
}
+
+ // Converts from a StreamExecutor Status to a TensorFlow Status.
+ //
+ // This assumes that the error codes between the two implementations
+ // match.
+ static Status ConvertStatus(const perftools::gputools::port::Status& s) {
+ return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>(
+ static_cast<int>(s.code())),
+ s.error_message());
+ }
};
} // namespace tensorflow
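
StreamExecutorUtil::ConvertStatus() relies on the StreamExecutor and TensorFlow error-code enums sharing the same numeric values, which is why a plain integer cast is sufficient. A self-contained sketch of that cast-based mapping, using hypothetical miniature status types in namespaces se and tf in place of perftools::gputools::port::Status and tensorflow::Status:

#include <cstdio>
#include <string>

namespace se {
// Miniature stand-in for the StreamExecutor status; the enum values are
// chosen to match the TensorFlow codes numerically.
enum class Code : int { kOk = 0, kInvalidArgument = 3, kInternal = 13 };
struct Status {
  Code code;
  std::string error_message;
  bool ok() const { return code == Code::kOk; }
};
}  // namespace se

namespace tf {
enum class Code : int { OK = 0, INVALID_ARGUMENT = 3, INTERNAL = 13 };
struct Status {
  Code code;
  std::string msg;
};

// Same shape as ConvertStatus() above: cast the code through int and
// carry the error message across unchanged.
Status ConvertStatus(const se::Status& s) {
  if (s.ok()) return Status{Code::OK, ""};
  return Status{static_cast<Code>(static_cast<int>(s.code)), s.error_message};
}
}  // namespace tf

int main() {
  se::Status bad{se::Code::kInvalidArgument, "bad flag"};
  tf::Status converted = tf::ConvertStatus(bad);
  std::printf("code=%d msg=%s\n", static_cast<int>(converted.code),
              converted.msg.c_str());
  return 0;
}
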