author Manjunath Kudlur <keveman@gmail.com> 2015-11-06 16:27:58 -0800
committer Manjunath Kudlur <keveman@gmail.com> 2015-11-06 16:27:58 -0800
commit f41959ccb2d9d4c722fe8fc3351401d53bcf4900 (patch)
tree ef0ca22cb2a5ac4bdec9d080d8e0788a53ed496d /tensorflow/core/common_runtime/gpu/process_state.cc
TensorFlow: Initial commit of TensorFlow library.
TensorFlow is an open source software library for numerical computation using data flow graphs. Base CL: 107276108
Diffstat (limited to 'tensorflow/core/common_runtime/gpu/process_state.cc')
-rw-r--r-- tensorflow/core/common_runtime/gpu/process_state.cc | 220
1 file changed, 220 insertions(+), 0 deletions(-)
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
new file mode 100644
index 0000000000..70ac6130c2
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -0,0 +1,220 @@
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/pool_allocator.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+
+#if defined(PLATFORM_GOOGLE)
+DEFINE_bool(record_mem_types, false,
+ "If true, record attributes of memory allocations and "
+ "dyanmically check for appropriate use of registered memory."
+ "Should only be true for debugging or diagnosis of "
+ "performance issues.");
+DEFINE_bool(brain_mem_reg_cuda_dma, true,
+ "If true, register CPU RAM used to copy to/from GPU RAM "
+ "with the CUDA driver.");
+DEFINE_bool(brain_gpu_use_bfc_allocator, false,
+ "If true, uses the Best-Fit GPU allocator.");
+DEFINE_bool(brain_gpu_region_allocator_debug, false,
+ "If true, checks for memory overwrites by writing "
+ "distinctive patterns on both ends of allocated memory.");
+DEFINE_bool(brain_gpu_region_allocator_reset_to_nan, false,
+ "If true, initializes all new Malloc buffers to NaN, "
+ "and resets the buffer to NaN upon Free.");
+
+#else
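+// Without gflags (non-PLATFORM_GOOGLE builds), fall back to plain globals
+// with the same defaults as the flags above.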
+bool FLAGS_record_mem_types = false;
+bool FLAGS_brain_mem_reg_cuda_dma = true;
+bool FLAGS_brain_gpu_region_allocator_debug = false;
+bool FLAGS_brain_gpu_region_allocator_reset_to_nan = false;
+bool FLAGS_brain_gpu_use_bfc_allocator = false;
+#endif
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+ProcessState* ProcessState::instance_ = nullptr;
+
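+// Returns the process-wide ProcessState, creating it on first use. Note
+// that this accessor takes no lock, so the first call is presumably
+// expected to happen before multiple threads are live.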
+/*static*/ ProcessState* ProcessState::singleton() {
+ if (instance_ == nullptr) {
+ instance_ = new ProcessState;
+ }
+
+ return instance_;
+}
+
+ProcessState::ProcessState() : gpu_count_(0) {
+ CHECK(instance_ == nullptr);
+ instance_ = this;
+}
+
+ProcessState::~ProcessState() {
+ for (auto p : gpu_allocators_) {
+ delete p;
+ }
+ instance_ = nullptr;
+}
+
+string ProcessState::MemDesc::DebugString() {
+ return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, ", dma: ",
+ gpu_registered, ", nic: ", nic_registered);
+}
+
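+// Returns the recorded attributes of the memory holding *ptr when
+// --record_mem_types is set and the pointer is known; otherwise returns a
+// default-constructed MemDesc.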
+ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
+ if (FLAGS_record_mem_types) {
+ auto iter = mem_desc_map_.find(ptr);
+ if (iter != mem_desc_map_.end()) {
+ return iter->second;
+ }
+ }
+ return MemDesc();
+}
+
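+// Records the number of visible GPUs. May be called repeatedly with the
+// same value, but must not change a previously set non-zero count.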
+void ProcessState::SetGPUCount(int c) {
+ CHECK(gpu_count_ == 0 || gpu_count_ == c)
+ << "Cannot call SetGPUCount with a non-zero value "
+ << "not equal to prior set value.";
+ gpu_count_ = c;
+}
+
+int ProcessState::GPUCount() const { return gpu_count_; }
+
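+// Returns the allocator for gpu_id, creating it on first use. The base
+// allocator is best-fit (GPUBFCAllocator) or region-based
+// (GPURegionAllocator) depending on --brain_gpu_use_bfc_allocator, and may
+// be wrapped in debug, NaN-reset, and recording layers per the other flags.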
+Allocator* ProcessState::GetGPUAllocator(int gpu_id, size_t total_bytes) {
+#if GOOGLE_CUDA
+ mutex_lock lock(mu_);
+ gpu::Platform* gpu_platform = GPUMachineManager();
+
+ // Verify that gpu_id is legitimate.
+ CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount())
+ << "gpu_id is outside discovered device range";
+
+ if (gpu_id >= static_cast<int64>(gpu_allocators_.size())) {
+ gpu_allocators_.resize(gpu_id + 1);
+ if (FLAGS_record_mem_types) gpu_al_.resize(gpu_id + 1);
+ }
+
+ if (gpu_allocators_[gpu_id] == nullptr) {
+ VisitableAllocator* gpu_allocator;
+
+ if (FLAGS_brain_gpu_use_bfc_allocator) {
+ gpu_allocator = new GPUBFCAllocator(gpu_id, total_bytes);
+ } else {
+ gpu_allocator = new GPURegionAllocator(gpu_id, total_bytes);
+ }
+
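+ // Optionally wrap the base allocator in debugging layers; each layer
+ // delegates to the allocator it wraps.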
+ if (FLAGS_brain_gpu_region_allocator_debug) {
+ gpu_allocator = new GPUDebugAllocator(gpu_allocator, gpu_id);
+ }
+ if (FLAGS_brain_gpu_region_allocator_reset_to_nan) {
+ gpu_allocator = new GPUNanResetAllocator(gpu_allocator, gpu_id);
+ }
+
+ gpu_allocators_[gpu_id] = gpu_allocator;
+
+ // If there are any pending AllocVisitors for this bus, add
+ // them now.
+ gpu::StreamExecutor* se =
+ gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+ int bus_id = se->GetDeviceDescription().numa_node();
+ if (bus_id < static_cast<int64>(gpu_visitors_.size())) {
+ for (auto v : gpu_visitors_[bus_id]) {
+ gpu_allocators_[gpu_id]->AddAllocVisitor(v);
+ }
+ }
+ if (FLAGS_record_mem_types) {
+ MemDesc md;
+ md.loc = MemDesc::GPU;
+ md.dev_index = gpu_id;
+ md.gpu_registered = false;
+ md.nic_registered = true;
+ if (static_cast<int64>(gpu_al_.size()) <= gpu_id)
+ gpu_al_.resize(gpu_id + 1);
+ gpu_al_[gpu_id] = new internal::RecordingAllocator(
+ &mem_desc_map_, gpu_allocators_[gpu_id], md, &mu_);
+ }
+ }
+ if (FLAGS_record_mem_types) return gpu_al_[gpu_id];
+ return gpu_allocators_[gpu_id];
+#else
+ LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda.";
+ return nullptr;
+#endif // GOOGLE_CUDA
+}
+
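+// Returns a pooled allocator for ordinary (unregistered) CPU RAM.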
+Allocator* ProcessState::GetCPUAllocator(int numa_node) {
+ // Although we're temporarily ignoring numa_node, check for legality.
+ CHECK_GE(numa_node, 0);
+ // TODO(tucker): actually maintain separate CPUAllocators for
+ // different numa_nodes. For now, just one.
+ numa_node = 0;
+ mutex_lock lock(mu_);
+ while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
+ cpu_allocators_.push_back(new PoolAllocator(
+ 100 /*pool_size_limit*/, true /*auto_resize*/, new BasicCPUAllocator(),
+ new NoopRounder, "cpu_pool"));
+ }
+ return cpu_allocators_[0];
+}
+
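+// Returns an allocator for CPU RAM that is registered with the CUDA driver
+// for DMA to/from GPU RAM (i.e. pinned memory). Falls back to the plain
+// CPU allocator when no GPUs are present or --brain_mem_reg_cuda_dma is
+// false.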
+Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
+ if (gpu_count_ == 0 || !FLAGS_brain_mem_reg_cuda_dma) {
+ return GetCPUAllocator(numa_node);
+ }
+ // Although we're temporarily ignoring numa_node, check for legality.
+ CHECK_GE(numa_node, 0);
+ // TODO(tucker): actually maintain separate CPUAllocators for
+ // different numa_nodes. For now, just one.
+ numa_node = 0;
+ mutex_lock lock(mu_);
+ while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) {
+ // CUDAHost allocation behaves the same across all GPUs, so just use
+ // the executor for the first device.
+ gpu::Platform* gpu_platform = GPUMachineManager();
+ gpu::StreamExecutor* se = gpu_platform->ExecutorForDevice(0).ValueOrDie();
+ CHECK(se);
+ cuda_host_allocators_.push_back(new PoolAllocator(
+ 100 /*pool_size_limit*/, true /*auto_resize*/,
+ new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host"));
+ if (FLAGS_record_mem_types) {
+ MemDesc md;
+ md.loc = MemDesc::CPU;
+ md.dev_index = 0;
+ md.gpu_registered = true;
+ md.nic_registered = false;
+ cuda_al_.push_back(new internal::RecordingAllocator(
+ &mem_desc_map_, cuda_host_allocators_.back(), md, &mu_));
+ }
+ }
+ if (FLAGS_record_mem_types) return cuda_al_[0];
+ return cuda_host_allocators_[0];
+}
+
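+// Adds visitor to every allocator already created for bus_id, and records
+// it so that allocators created on that bus later also receive it.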
+void ProcessState::AddGPUAllocVisitor(int bus_id, AllocVisitor visitor) {
+#if GOOGLE_CUDA
+ mutex_lock lock(mu_);
+ gpu::Platform* gpu_platform = GPUMachineManager();
+ for (int gpu_id = 0; gpu_id < static_cast<int64>(gpu_allocators_.size());
+ ++gpu_id) {
+ gpu::StreamExecutor* se =
+ gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+ if (gpu_allocators_[gpu_id] &&
+ se->GetDeviceDescription().numa_node() == bus_id) {
+ gpu_allocators_[gpu_id]->AddAllocVisitor(visitor);
+ }
+ }
+ while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
+ gpu_visitors_.push_back(std::vector<AllocVisitor>());
+ }
+ gpu_visitors_[bus_id].push_back(visitor);
+#endif // GOOGLE_CUDA
+}
+
+} // namespace tensorflow