// tensorflow/core/common_runtime/gpu/process_state.cc
#include "tensorflow/core/common_runtime/gpu/process_state.h"

#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h"
#include "tensorflow/core/common_runtime/gpu/pool_allocator.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/port.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"

#if defined(PLATFORM_GOOGLE)
DEFINE_bool(record_mem_types, false,
            "If true, record attributes of memory allocations and "
            "dyanmically check for appropriate use of registered memory."
            "Should only be true for debugging or diagnosis of "
            "performance issues.");
DEFINE_bool(brain_mem_reg_cuda_dma, true,
            "If true, register CPU RAM used to copy to/from GPU RAM "
            "with the CUDA driver.");
DEFINE_bool(brain_gpu_use_bfc_allocator, false,
            "If true, uses the Best-Fit GPU allocator.");
DEFINE_bool(brain_gpu_region_allocator_debug, false,
            "If true, checks for memory overwrites by writing "
            "distinctive patterns on both ends of allocated memory.");
DEFINE_bool(brain_gpu_region_allocator_reset_to_nan, false,
            "If true, initializes all new Malloc buffers to NaN, "
            "and resets the buffer to NaN upon Free.");

#else
bool FLAGS_record_mem_types = false;
bool FLAGS_brain_mem_reg_cuda_dma = true;
bool FLAGS_brain_gpu_region_allocator_debug = false;
bool FLAGS_brain_gpu_region_allocator_reset_to_nan = false;
bool FLAGS_brain_gpu_use_bfc_allocator = false;
#endif

namespace gpu = ::perftools::gputools;

namespace tensorflow {

ProcessState* ProcessState::instance_ = nullptr;

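// Returns the process-wide ProcessState singleton, creating it lazily on
// first use.  Note that the lazy creation itself is not guarded by a mutex,
// so the first call should happen before concurrent use.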
/*static*/ ProcessState* ProcessState::singleton() {
  if (instance_ == nullptr) {
    instance_ = new ProcessState;
  }

  return instance_;
}

ProcessState::ProcessState() : gpu_count_(0) {
  CHECK(instance_ == nullptr);
  instance_ = this;
}

ProcessState::~ProcessState() {
  for (auto p : gpu_allocators_) {
    delete p;
  }
  instance_ = nullptr;
}

string ProcessState::MemDesc::DebugString() {
  return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, ", dma: ",
                         gpu_registered, ", nic: ", nic_registered);
}

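// Returns the recorded MemDesc for a pointer previously tagged by a
// RecordingAllocator.  Only meaningful when --record_mem_types is enabled;
// otherwise (or for unknown pointers) a default-constructed MemDesc is
// returned.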
ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
  if (FLAGS_record_mem_types) {
    auto iter = mem_desc_map_.find(ptr);
    if (iter != mem_desc_map_.end()) {
      return iter->second;
    }
  }
  return MemDesc();
}

void ProcessState::SetGPUCount(int c) {
  CHECK(gpu_count_ == 0 || gpu_count_ == c)
      << "Cannot call SetGPUCount with a non-zero value "
      << "not equal to prior set value.";
  gpu_count_ = c;
}

int ProcessState::GPUCount() const { return gpu_count_; }

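// Returns the allocator for GPU gpu_id, creating it on first request with a
// capacity of total_bytes.  Depending on the flags above, the base allocator
// (BFC or region) may be wrapped with debug, NaN-reset, and/or recording
// layers.  Requires that the binary was built with CUDA support.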
Allocator* ProcessState::GetGPUAllocator(int gpu_id, size_t total_bytes) {
#if GOOGLE_CUDA
  mutex_lock lock(mu_);
  gpu::Platform* gpu_platform = GPUMachineManager();

  // Verify that gpu_id is legitimate.
  CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount())
      << "gpu_id is outside discovered device range";

  if (gpu_id >= static_cast<int64>(gpu_allocators_.size())) {
    gpu_allocators_.resize(gpu_id + 1);
    if (FLAGS_record_mem_types) gpu_al_.resize(gpu_id + 1);
  }

  if (gpu_allocators_[gpu_id] == nullptr) {
    VisitableAllocator* gpu_allocator;

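    // Choose the base GPU allocator implementation: best-fit-with-coalescing
    // (BFC) or the region allocator.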
    if (FLAGS_brain_gpu_use_bfc_allocator) {
      gpu_allocator = new GPUBFCAllocator(gpu_id, total_bytes);
    } else {
      gpu_allocator = new GPURegionAllocator(gpu_id, total_bytes);
    }

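    // Optionally wrap the base allocator with layers that check for
    // out-of-bounds writes and that poison buffers with NaN on alloc/free.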
    if (FLAGS_brain_gpu_region_allocator_debug) {
      gpu_allocator = new GPUDebugAllocator(gpu_allocator, gpu_id);
    }
    if (FLAGS_brain_gpu_region_allocator_reset_to_nan) {
      gpu_allocator = new GPUNanResetAllocator(gpu_allocator, gpu_id);
    }

    gpu_allocators_[gpu_id] = gpu_allocator;

    // If there are any pending AllocVisitors for this bus, add
    // them now.
    gpu::StreamExecutor* se =
        gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
    int bus_id = se->GetDeviceDescription().numa_node();
    if (bus_id < static_cast<int64>(gpu_visitors_.size())) {
      for (auto v : gpu_visitors_[bus_id]) {
        gpu_allocators_[gpu_id]->AddAllocVisitor(v);
      }
    }
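    // When recording memory types, return a wrapper that tags every
    // allocation with this GPU's MemDesc in mem_desc_map_.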
    if (FLAGS_record_mem_types) {
      MemDesc md;
      md.loc = MemDesc::GPU;
      md.dev_index = gpu_id;
      md.gpu_registered = false;
      md.nic_registered = true;
      if (static_cast<int64>(gpu_al_.size()) <= gpu_id)
        gpu_al_.resize(gpu_id + 1);
      gpu_al_[gpu_id] = new internal::RecordingAllocator(
          &mem_desc_map_, gpu_allocators_[gpu_id], md, &mu_);
    }
  }
  if (FLAGS_record_mem_types) return gpu_al_[gpu_id];
  return gpu_allocators_[gpu_id];
#else
  LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda.";
  return nullptr;
#endif  // GOOGLE_CUDA
}

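// Returns a pooled CPU allocator.  numa_node is currently ignored beyond a
// legality check; a single allocator is shared across all NUMA nodes.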
Allocator* ProcessState::GetCPUAllocator(int numa_node) {
  // Although we're temporarily ignoring numa_node, check for legality.
  CHECK_GE(numa_node, 0);
  // TODO(tucker): actually maintain separate CPUAllocators for
  // different numa_nodes.  For now, just one.
  numa_node = 0;
  mutex_lock lock(mu_);
  while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
    cpu_allocators_.push_back(new PoolAllocator(
        100 /*pool_size_limit*/, true /*auto_resize*/, new BasicCPUAllocator(),
        new NoopRounder, "cpu_pool"));
  }
  return cpu_allocators_[0];
}

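// Returns an allocator for CPU memory that is registered with the CUDA
// driver for DMA to/from GPUs.  Falls back to the plain CPU allocator when
// no GPUs are present or --brain_mem_reg_cuda_dma is false.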
Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
  if (gpu_count_ == 0 || !FLAGS_brain_mem_reg_cuda_dma) {
    return GetCPUAllocator(numa_node);
  }
  // Although we're temporarily ignoring numa_node, check for legality.
  CHECK_GE(numa_node, 0);
  // TODO(tucker): actually maintain separate CPUAllocators for
  // different numa_nodes.  For now, just one.
  numa_node = 0;
  mutex_lock lock(mu_);
  while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) {
    // CUDA host allocation behaves the same across all GPUs, so just get
    // the executor for the first device.
    gpu::Platform* gpu_platform = GPUMachineManager();
    gpu::StreamExecutor* se = gpu_platform->ExecutorForDevice(0).ValueOrDie();
    CHECK(se);
    cuda_host_allocators_.push_back(new PoolAllocator(
        100 /*pool_size_limit*/, true /*auto_resize*/,
        new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host"));
    if (FLAGS_record_mem_types) {
      MemDesc md;
      md.loc = MemDesc::CPU;
      md.dev_index = 0;
      md.gpu_registered = true;
      md.nic_registered = false;
      cuda_al_.push_back(new internal::RecordingAllocator(
          &mem_desc_map_, cuda_host_allocators_.back(), md, &mu_));
    }
  }
  if (FLAGS_record_mem_types) return cuda_al_[0];
  return cuda_host_allocators_[0];
}

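// Applies visitor to every existing GPU allocator whose device sits on
// bus_id, and remembers it so that allocators created later for that bus
// also receive it (see GetGPUAllocator).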
void ProcessState::AddGPUAllocVisitor(int bus_id, AllocVisitor visitor) {
#if GOOGLE_CUDA
  mutex_lock lock(mu_);
  gpu::Platform* gpu_platform = GPUMachineManager();
  for (int gpu_id = 0; gpu_id < static_cast<int64>(gpu_allocators_.size());
       ++gpu_id) {
    gpu::StreamExecutor* se =
        gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
    if (gpu_allocators_[gpu_id] &&
        se->GetDeviceDescription().numa_node() == bus_id) {
      gpu_allocators_[gpu_id]->AddAllocVisitor(visitor);
    }
  }
  while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
    gpu_visitors_.push_back(std::vector<AllocVisitor>());
  }
  gpu_visitors_[bus_id].push_back(visitor);
#endif  // GOOGLE_CUDA
}

}  // namespace tensorflow
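
// Illustrative usage sketch (not part of this file): how a caller might
// obtain allocators from the ProcessState singleton.  The sizes and id
// values below are placeholders, and the calls assume the standard
// tensorflow::Allocator interface (AllocateRaw/DeallocateRaw).
//
//   ProcessState* ps = ProcessState::singleton();
//   ps->SetGPUCount(1);
//   Allocator* gpu_alloc = ps->GetGPUAllocator(/*gpu_id=*/0,
//                                              /*total_bytes=*/1 << 30);
//   Allocator* host_alloc = ps->GetCUDAHostAllocator(/*numa_node=*/0);
//   void* staging = host_alloc->AllocateRaw(Allocator::kAllocatorAlignment,
//                                           /*num_bytes=*/4096);
//   // ... fill the pinned staging buffer, enqueue the host-to-device copy ...
//   host_alloc->DeallocateRaw(staging);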