path: root/tensorflow/core/common_runtime/gpu/gpu_init.cc
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"

#include <string>

#include "tensorflow/core/platform/port.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/stream_executor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/strcat.h"

namespace gpu = ::perftools::gputools;

namespace tensorflow {

namespace {

// Builds a map from the device-ordinal pair (i, j) to whether the executor for
// device i reports that it can enable peer access to device j.
std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(
    gpu::Platform* platform, int device_count) {
  auto* map = new std::map<std::pair<int, int>, bool>;
  for (int i = 0; i < device_count; ++i) {
    for (int j = 0; j < device_count; ++j) {
      gpu::StreamExecutor* from = platform->ExecutorForDevice(i).ValueOrDie();
      gpu::StreamExecutor* to = platform->ExecutorForDevice(j).ValueOrDie();
      (*map)[{i, j}] = from->CanEnablePeerAccessTo(to);
    }
  }

  return std::unique_ptr<std::map<std::pair<int, int>, bool>>{map};
}

// Enables peer access between every pair of visible devices that supports it;
// pairs that cannot be peered are logged and skipped.
Status EnablePeerAccess(gpu::Platform* platform, int device_count) {
  for (int i = 0; i < device_count; ++i) {
    for (int j = 0; j < device_count; ++j) {
      gpu::StreamExecutor* from = platform->ExecutorForDevice(i).ValueOrDie();
      gpu::StreamExecutor* to = platform->ExecutorForDevice(j).ValueOrDie();

      if (from->CanEnablePeerAccessTo(to)) {
        auto status = from->EnablePeerAccessTo(to);
        if (!status.ok()) {
          return errors::Internal(status.ToString());
        }
      } else {
        LOG(INFO) << "cannot enable peer access from device ordinal " << i
                  << " to device ordinal " << j;
      }
    }
  }
  return Status::OK();
}

// Looks up the CUDA platform, logs the properties of every visible GPU, and
// enables peer-to-peer access between devices where the hardware supports it.
static void InitGPU() {
  auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
  if (!result.ok()) {
    LOG(WARNING)
        << "Not initializing the GPU: could not create GPU MachineManager. "
        << "Error: " << result.status();
    return;
  }

  gpu::Platform* platform = result.ValueOrDie();

  int dev_count = platform->VisibleDeviceCount();

  if (dev_count == 0) {
    LOG(INFO) << "No GPU devices available on machine.";
    return;
  }

  for (int i = 0; i < dev_count; ++i) {
    auto stream_exec = platform->ExecutorForDevice(i).ValueOrDie();
    int64 free_bytes;
    int64 total_bytes;
    if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
      // Logs internally on failure.
      free_bytes = 0;
      total_bytes = 0;
    }
    const auto& description = stream_exec->GetDeviceDescription();
    int cc_major;
    int cc_minor;
    if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
      // Logs internally on failure.
      cc_major = 0;
      cc_minor = 0;
    }
    LOG(INFO) << "Found device " << i << " with properties: "
              << "\nname: " << description.name() << "\nmajor: " << cc_major
              << " minor: " << cc_minor << " memoryClockRate (GHz) "
              << description.clock_rate_ghz() << "\npciBusID "
              << description.pci_bus_id() << "\nTotal memory: "
              << strings::HumanReadableNumBytes(total_bytes)
              << "\nFree memory: "
              << strings::HumanReadableNumBytes(free_bytes);
  }

  // Enable peer access
  auto status = EnablePeerAccess(platform, dev_count);
  if (!status.ok()) {
    LOG(FATAL) << "could not enable peer access for GPU devices: " << status;
  }

  // Print out a matrix showing which devices can DMA to one
  // another.
  auto access_map = GetPeerAccessMap(platform, dev_count);
  string line_buf = "DMA: ";
  for (int i = 0; i < dev_count; ++i) {
    strings::StrAppend(&line_buf, i, " ");
  }
  LOG(INFO) << line_buf;
  for (int i = 0; i < dev_count; ++i) {
    line_buf = strings::StrCat(i, ":   ");
    for (int j = 0; j < dev_count; ++j) {
      if ((*access_map)[{i, j}]) {
        line_buf.append("Y ");
      } else {
        line_buf.append("N ");
      }
    }
    LOG(INFO) << line_buf;
  }
}
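
// For two devices that can each access the other, the DMA matrix printed by
// InitGPU() above would look roughly like this (illustrative output only,
// assuming peer access succeeds for every pair):
//
//   DMA: 0 1
//   0:   Y Y
//   1:   Y Y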

// One-time initialization hook; its result is latched by the static local in
// GPUMachineManager() so InitGPU() runs at most once.
static bool InitModule() {
  InitGPU();
  return true;
}

}  // namespace

// Returns the CUDA Platform, or nullptr if the CUDA platform is unavailable.
gpu::Platform* GPUMachineManager() {
  // Create the machine manager singleton and initialize the GPUs only
  // once.
  static bool init = InitModule();
  CHECK(init);  // Avoids compiler warning that init is unused.

  auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
  if (!result.ok()) {
    return nullptr;
  }

  return result.ValueOrDie();
}

}  // namespace tensorflow
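
// ---------------------------------------------------------------------------
// Illustrative usage sketch (an assumption for illustration; not part of the
// original file): callers can obtain the CUDA platform through
// GPUMachineManager() and must check for nullptr before using it. Every call
// below (VisibleDeviceCount, ExecutorForDevice, GetDeviceDescription) is the
// same StreamExecutor API already used in this file.
//
//   perftools::gputools::Platform* platform = tensorflow::GPUMachineManager();
//   if (platform == nullptr) {
//     LOG(INFO) << "No CUDA platform available.";
//   } else {
//     for (int i = 0; i < platform->VisibleDeviceCount(); ++i) {
//       auto* exec = platform->ExecutorForDevice(i).ValueOrDie();
//       LOG(INFO) << "Device " << i << ": "
//                 << exec->GetDeviceDescription().name();
//     }
//   }
// ---------------------------------------------------------------------------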