diff options
-rw-r--r-- | tensorflow/core/common_runtime/gpu/gpu_device.cc | 166 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/gpu/gpu_device.h | 4 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/gpu/gpu_init.cc | 134 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/gpu/gpu_init.h | 9 | ||||
-rw-r--r-- | tensorflow/core/util/stream_executor_util.h | 12 |
5 files changed, 177 insertions, 148 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 033d67772c..d644dfa037 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -54,6 +54,7 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/stream_executor_util.h" namespace gpu = ::perftools::gputools; @@ -710,13 +711,14 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options, return Status::OK(); } -static int GetDefaultMinGPUMultiprocessorCount(gpu::Platform* gpu_manager) { +static int GetDefaultMinGPUMultiprocessorCount( + gpu::Platform* gpu_manager, const std::vector<int>& visible_gpu_order) { static const int kDefaultMinGPUMultiprocessorCount = 8; // Find the highest multi-processor count across all visible GPUs. int max_count = -1; - for (int i = 0; i < gpu_manager->VisibleDeviceCount(); ++i) { - auto exec_status = gpu_manager->ExecutorForDevice(i); + for (int i = 0; i < visible_gpu_order.size(); ++i) { + auto exec_status = gpu_manager->ExecutorForDevice(visible_gpu_order[i]); if (!exec_status.ok()) { continue; } @@ -733,12 +735,13 @@ static int GetDefaultMinGPUMultiprocessorCount(gpu::Platform* gpu_manager) { } } -static int GetMinGPUMultiprocessorCount(gpu::Platform* gpu_manager) { +static int GetMinGPUMultiprocessorCount( + gpu::Platform* gpu_manager, const std::vector<int>& visible_gpu_order) { const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT"); if (tf_min_gpu_core_count == nullptr || strcmp(tf_min_gpu_core_count, "") == 0) { - return GetDefaultMinGPUMultiprocessorCount(gpu_manager); + return GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order); } int min_gpu_core_count = -1; @@ -748,7 +751,8 @@ static int GetMinGPUMultiprocessorCount(gpu::Platform* gpu_manager) { } } - int count = GetDefaultMinGPUMultiprocessorCount(gpu_manager); + int count = + GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order); LOG(ERROR) << "Invalid minimum GPU multiprocessor count: [" << tf_min_gpu_core_count << "]. " << "Using the default value: " << count; @@ -810,10 +814,58 @@ std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() { return cuda_caps; } +std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap( + gpu::Platform* platform, const std::vector<int>& visible_gpu_order) { + std::unique_ptr<std::map<std::pair<int, int>, bool>> map( + new std::map<std::pair<int, int>, bool>); + for (int i = 0; i < visible_gpu_order.size(); ++i) { + const int i_gpu_id = visible_gpu_order[i]; + for (int j = 0; j < visible_gpu_order.size(); ++j) { + const int j_gpu_id = visible_gpu_order[j]; + gpu::StreamExecutor* from = + platform->ExecutorForDevice(i_gpu_id).ValueOrDie(); + gpu::StreamExecutor* to = + platform->ExecutorForDevice(j_gpu_id).ValueOrDie(); + (*map)[{i, j}] = from->CanEnablePeerAccessTo(to); + } + } + + return map; +} + +Status EnablePeerAccess(gpu::Platform* platform, + const std::vector<int>& visible_gpu_order) { + for (int i = 0; i < visible_gpu_order.size(); ++i) { + const int i_gpu_id = visible_gpu_order[i]; + for (int j = 0; j < visible_gpu_order.size(); ++j) { + const int j_gpu_id = visible_gpu_order[j]; + // We have already validated that ExecutorForDevice() calls + // return OK. + gpu::StreamExecutor* from = + platform->ExecutorForDevice(i_gpu_id).ValueOrDie(); + gpu::StreamExecutor* to = + platform->ExecutorForDevice(j_gpu_id).ValueOrDie(); + + if (from->CanEnablePeerAccessTo(to)) { + auto status = from->EnablePeerAccessTo(to); + if (!status.ok()) { + return errors::Internal(status.ToString()); + } + } else { + LOG(INFO) << "cannot enable peer access from device ordinal " + << i_gpu_id << " to device ordinal " << j_gpu_id; + } + } + } + return Status::OK(); +} + } // namespace Status BaseGPUDeviceFactory::GetValidDeviceIds( const string& visible_device_list, std::vector<int>* ids) { + TF_RETURN_IF_ERROR(ValidateGPUMachineManager()); + gpu::Platform* gpu_manager = GPUMachineManager(); if (gpu_manager == nullptr) { return Status::OK(); @@ -824,16 +876,6 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( return Status::OK(); } - auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities(); - if (cuda_supported_capabilities.empty()) { - return errors::FailedPrecondition( - "No supported cuda capabilities in binary."); - } - CudaVersion min_supported_capability = *std::min_element( - cuda_supported_capabilities.begin(), cuda_supported_capabilities.end()); - - int min_gpu_core_count = GetMinGPUMultiprocessorCount(gpu_manager); - // If the user wants to remap the visible to virtual GPU mapping, // check for that here. std::vector<int> visible_gpu_order; @@ -863,6 +905,96 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( } } + // Validate no repeats. + std::set<int> visible_device_set(visible_gpu_order.begin(), + visible_gpu_order.end()); + if (visible_device_set.size() != visible_gpu_order.size()) { + return errors::InvalidArgument( + "visible_device_list contained " + "a duplicate entry: ", + visible_device_list); + } + + bool new_gpu_found = false; + for (int i = 0; i < visible_gpu_order.size(); ++i) { + int gpu_id = visible_gpu_order[i]; + + // Only perform this once per visible gpu id. + if (visible_gpu_initialized_[gpu_id]) { + continue; + } + + visible_gpu_initialized_[gpu_id] = true; + new_gpu_found = true; + + auto executor = gpu_manager->ExecutorForDevice(gpu_id); + if (!executor.ok()) { + return StreamExecutorUtil::ConvertStatus(executor.status()); + } + + auto stream_exec = executor.ValueOrDie(); + int64 free_bytes; + int64 total_bytes; + if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) { + // Logs internally on failure. + free_bytes = 0; + total_bytes = 0; + } + const auto& description = stream_exec->GetDeviceDescription(); + int cc_major; + int cc_minor; + if (!description.cuda_compute_capability(&cc_major, &cc_minor)) { + // Logs internally on failure. + cc_major = 0; + cc_minor = 0; + } + LOG(INFO) << "Found device " << i << " with properties: " + << "\nname: " << description.name() << "\nmajor: " << cc_major + << " minor: " << cc_minor << " memoryClockRate (GHz) " + << description.clock_rate_ghz() << "\npciBusID " + << description.pci_bus_id() << "\nTotal memory: " + << strings::HumanReadableNumBytes(total_bytes) + << "\nFree memory: " + << strings::HumanReadableNumBytes(free_bytes); + } + + if (new_gpu_found) { + // Enable peer access + TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order)); + + // Print out a matrix showing which devices can DMA to one + // another. + auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order); + string line_buf = "DMA: "; + for (int i = 0; i < visible_gpu_order.size(); ++i) { + strings::StrAppend(&line_buf, visible_gpu_order[i], " "); + } + LOG(INFO) << line_buf; + for (int i = 0; i < visible_gpu_order.size(); ++i) { + line_buf = strings::StrCat(visible_gpu_order[i], ": "); + for (int j = 0; j < visible_gpu_order.size(); ++j) { + if ((*access_map)[{i, j}]) { + line_buf.append("Y "); + } else { + line_buf.append("N "); + } + } + LOG(INFO) << line_buf; + } + } + + auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities(); + if (cuda_supported_capabilities.empty()) { + return errors::FailedPrecondition( + "No supported cuda capabilities in binary."); + } + CudaVersion min_supported_capability = *std::min_element( + cuda_supported_capabilities.begin(), cuda_supported_capabilities.end()); + + int min_gpu_core_count = + GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order); + + // Filter out devices that don't have the right capability or power. for (int i = 0; i < visible_gpu_order.size(); ++i) { const int32 visible_gpu_id = visible_gpu_order[i]; auto exec_status = gpu_manager->ExecutorForDevice(visible_gpu_id); @@ -879,7 +1011,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( // Only GPUs with no less than the minimum supported compute capability is // accepted. if (device_capability < min_supported_capability) { - LOG(INFO) << "Ignoring physical gpu device " + LOG(INFO) << "Ignoring visible gpu device " << "(" << GetShortDeviceDescription(visible_gpu_id, desc) << ") " << "with Cuda compute capability " << device_capability diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index abe7c0f687..7bd909e9ce 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -129,6 +129,10 @@ class BaseGPUDeviceFactory : public DeviceFactory { // gpu ids'. Status GetValidDeviceIds(const string& visible_device_list, std::vector<int>* ids); + + // visible_gpu_initialized_[gpu_id] is true if visible GPU gpu_id + // has been initialized by the process. + std::unordered_map<int, bool> visible_gpu_initialized_; }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.cc b/tensorflow/core/common_runtime/gpu/gpu_init.cc index b0535c7927..aa23e3cc61 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_init.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_init.cc @@ -19,153 +19,27 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/stream_executor_util.h" namespace gpu = ::perftools::gputools; namespace tensorflow { -namespace { - -std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap( - gpu::Platform* platform, int device_count) { - auto* map = new std::map<std::pair<int, int>, bool>; - for (int i = 0; i < device_count; ++i) { - for (int j = 0; j < device_count; ++j) { - gpu::StreamExecutor* from = platform->ExecutorForDevice(i).ValueOrDie(); - gpu::StreamExecutor* to = platform->ExecutorForDevice(j).ValueOrDie(); - (*map)[{i, j}] = from->CanEnablePeerAccessTo(to); - } - } - - return std::unique_ptr<std::map<std::pair<int, int>, bool>>{map}; -} - -Status EnablePeerAccess(gpu::Platform* platform, int device_count) { - for (int i = 0; i < device_count; ++i) { - for (int j = 0; j < device_count; ++j) { - // We have already validated that ExecutorForDevice() calls - // return OK. - gpu::StreamExecutor* from = platform->ExecutorForDevice(i).ValueOrDie(); - gpu::StreamExecutor* to = platform->ExecutorForDevice(j).ValueOrDie(); - - if (from->CanEnablePeerAccessTo(to)) { - auto status = from->EnablePeerAccessTo(to); - if (!status.ok()) { - return errors::Internal(status.ToString()); - } - } else { - LOG(INFO) << "cannot enable peer access from device ordinal " << i - << " to device ordinal " << j; - } - } - } - return Status::OK(); -} - -namespace { - -// TODO(vrv): Move this out into a common header so it can be used -// more widely. -Status ConvertStatus(const perftools::gputools::port::Status& s) { - return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>( - static_cast<int>(s.code())), - s.error_message()); -} - -} // namespace - -static Status InitGPU() { +Status ValidateGPUMachineManager() { auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA"); if (!result.ok()) { - return ConvertStatus(result.status()); - } - - gpu::Platform* platform = result.ValueOrDie(); - - int dev_count = platform->VisibleDeviceCount(); - - if (dev_count <= 0) { - LOG(INFO) << "No GPU devices available on machine."; - return Status::OK(); - } - - for (int i = 0; i < dev_count; ++i) { - auto executor = platform->ExecutorForDevice(i); - if (!executor.ok()) { - return ConvertStatus(executor.status()); - } - - auto stream_exec = executor.ValueOrDie(); - int64 free_bytes; - int64 total_bytes; - if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) { - // Logs internally on failure. - free_bytes = 0; - total_bytes = 0; - } - const auto& description = stream_exec->GetDeviceDescription(); - int cc_major; - int cc_minor; - if (!description.cuda_compute_capability(&cc_major, &cc_minor)) { - // Logs internally on failure. - cc_major = 0; - cc_minor = 0; - } - LOG(INFO) << "Found device " << i << " with properties: " - << "\nname: " << description.name() << "\nmajor: " << cc_major - << " minor: " << cc_minor << " memoryClockRate (GHz) " - << description.clock_rate_ghz() << "\npciBusID " - << description.pci_bus_id() << "\nTotal memory: " - << strings::HumanReadableNumBytes(total_bytes) - << "\nFree memory: " - << strings::HumanReadableNumBytes(free_bytes); - } - - // Enable peer access - TF_RETURN_IF_ERROR(EnablePeerAccess(platform, dev_count)); - - // Print out a matrix showing which devices can DMA to one - // another. - auto access_map = GetPeerAccessMap(platform, dev_count); - string line_buf = "DMA: "; - for (int i = 0; i < dev_count; ++i) { - strings::StrAppend(&line_buf, i, " "); - } - LOG(INFO) << line_buf; - for (int i = 0; i < dev_count; ++i) { - line_buf = strings::StrCat(i, ": "); - for (int j = 0; j < dev_count; ++j) { - if ((*access_map)[{i, j}]) { - line_buf.append("Y "); - } else { - line_buf.append("N "); - } - } - LOG(INFO) << line_buf; + return StreamExecutorUtil::ConvertStatus(result.status()); } return Status::OK(); } -static Status InitModule() { return InitGPU(); } - -} // namespace - gpu::Platform* GPUMachineManager() { - // Create the machine manager singleton and initialize the GPUs only - // once. - static Status init = InitModule(); - if (!init.ok()) { - LOG(WARNING) - << "Not initializing the GPU, could not create GPU MachineManager. " - << "Error: " << init; - return nullptr; - } - auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA"); if (!result.ok()) { LOG(FATAL) << "Could not find Platform with name CUDA"; diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.h b/tensorflow/core/common_runtime/gpu/gpu_init.h index fdab603eb8..927d05d5ba 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_init.h +++ b/tensorflow/core/common_runtime/gpu/gpu_init.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_ #define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_ +#include "tensorflow/core/lib/core/status.h" + namespace perftools { namespace gputools { class Platform; @@ -24,9 +26,14 @@ class Platform; namespace tensorflow { +// Initializes the CUDA platform and returns OK if the CUDA +// platform could be initialized. +Status ValidateGPUMachineManager(); + // Returns the GPU machine manager singleton, creating it and // initializing the GPUs on the machine if needed the first time it is -// called. +// called. Must only be called when there is a valid GPU environment +// in the process (e.g., ValidateGPUMachineManager() returns OK). perftools::gputools::Platform* GPUMachineManager(); } // namespace tensorflow diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h index f8debfe416..6a5ddec04c 100644 --- a/tensorflow/core/util/stream_executor_util.h +++ b/tensorflow/core/util/stream_executor_util.h @@ -17,6 +17,8 @@ limitations under the License. #define TENSORFLOW_CORE_UTIL_STREAM_EXECUTOR_UTIL_H_ #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/stream_executor.h" namespace tensorflow { @@ -33,6 +35,16 @@ class StreamExecutorUtil { return perftools::gputools::DeviceMemory<T>( perftools::gputools::DeviceMemoryBase(ptr, t.TotalBytes())); } + + // Converts from a StreamExecutor Status to a TensorFlow Status. + // + // This assumes that the error codes between the two implementations + // match. + static Status ConvertStatus(const perftools::gputools::port::Status& s) { + return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>( + static_cast<int>(s.code())), + s.error_message()); + } }; } // namespace tensorflow |