author | Manjunath Kudlur <keveman@gmail.com> | 2015-11-06 16:27:58 -0800
committer | Manjunath Kudlur <keveman@gmail.com> | 2015-11-06 16:27:58 -0800
commit | f41959ccb2d9d4c722fe8fc3351401d53bcf4900 (patch)
tree | ef0ca22cb2a5ac4bdec9d080d8e0788a53ed496d /tensorflow/core/common_runtime/gpu/process_state.cc
TensorFlow: Initial commit of TensorFlow library.
TensorFlow is an open source software library for numerical computation
using data flow graphs.
Base CL: 107276108
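For readers landing on this initial commit, the sketch below illustrates what "numerical computation using data flow graphs" means in practice, written against the public C++ `Session` API this release ships under `tensorflow/core/public/`. It is our illustration, not code from this commit: the `AddConstNode` helper is hypothetical, and the exact header set and availability of `TF_CHECK_OK` are assumptions about this early tree.

```cpp
// Hedged sketch (not part of this commit): run a tiny data flow graph
// computing c = a + b through the public C++ Session API.
#include <iostream>
#include <memory>
#include <vector>

#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/public/session_options.h"
#include "tensorflow/core/public/tensor.h"

namespace tf = tensorflow;

// Append a scalar float Const node to the graph.
// This helper is ours, not a TensorFlow API.
static void AddConstNode(tf::GraphDef* g, const std::string& name, float v) {
  tf::NodeDef* n = g->add_node();
  n->set_name(name);
  n->set_op("Const");
  (*n->mutable_attr())["dtype"].set_type(tf::DT_FLOAT);
  tf::TensorProto* t = (*n->mutable_attr())["value"].mutable_tensor();
  t->set_dtype(tf::DT_FLOAT);
  t->mutable_tensor_shape();  // empty shape: a scalar
  t->add_float_val(v);
}

int main() {
  // Describe the computation as a GraphDef protocol buffer.
  tf::GraphDef graph;
  AddConstNode(&graph, "a", 1.0f);
  AddConstNode(&graph, "b", 2.0f);
  tf::NodeDef* add = graph.add_node();
  add->set_name("c");
  add->set_op("Add");
  add->add_input("a");
  add->add_input("b");
  (*add->mutable_attr())["T"].set_type(tf::DT_FLOAT);

  // The Session owns runtime state, including the per-process
  // allocators that process_state.cc manages on the GPU path.
  std::unique_ptr<tf::Session> session(
      tf::NewSession(tf::SessionOptions()));
  TF_CHECK_OK(session->Create(graph));

  std::vector<tf::Tensor> outputs;
  TF_CHECK_OK(session->Run({}, {"c"}, {}, &outputs));
  std::cout << outputs[0].scalar<float>()() << "\n";  // prints 3
  return 0;
}
```

A graph is first described as a `GraphDef` protocol buffer and only then handed to a `Session`, which plans execution and obtains memory through runtime components such as the `ProcessState` class introduced in this file.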
Diffstat (limited to 'tensorflow/core/common_runtime/gpu/process_state.cc')
-rw-r--r-- | tensorflow/core/common_runtime/gpu/process_state.cc | 220
1 file changed, 220 insertions, 0 deletions
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
new file mode 100644
index 0000000000..70ac6130c2
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -0,0 +1,220 @@
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/pool_allocator.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+
+#if defined(PLATFORM_GOOGLE)
+DEFINE_bool(record_mem_types, false,
+            "If true, record attributes of memory allocations and "
+            "dynamically check for appropriate use of registered memory. "
+            "Should only be true for debugging or diagnosis of "
+            "performance issues.");
+DEFINE_bool(brain_mem_reg_cuda_dma, true,
+            "If true, register CPU RAM used to copy to/from GPU RAM "
+            "with the CUDA driver.");
+DEFINE_bool(brain_gpu_use_bfc_allocator, false,
+            "If true, uses the Best-Fit GPU allocator.");
+DEFINE_bool(brain_gpu_region_allocator_debug, false,
+            "If true, checks for memory overwrites by writing "
+            "distinctive patterns on both ends of allocated memory.");
+DEFINE_bool(brain_gpu_region_allocator_reset_to_nan, false,
+            "If true, initializes all new Malloc buffers to NaN, "
+            "and resets the buffer to NaN upon Free.");
+
+#else
+bool FLAGS_record_mem_types = false;
+bool FLAGS_brain_mem_reg_cuda_dma = true;
+bool FLAGS_brain_gpu_region_allocator_debug = false;
+bool FLAGS_brain_gpu_region_allocator_reset_to_nan = false;
+bool FLAGS_brain_gpu_use_bfc_allocator = false;
+#endif
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+ProcessState* ProcessState::instance_ = nullptr;
+
+/*static*/ ProcessState* ProcessState::singleton() {
+  if (instance_ == nullptr) {
+    instance_ = new ProcessState;
+  }
+
+  return instance_;
+}
+
+ProcessState::ProcessState() : gpu_count_(0) {
+  CHECK(instance_ == nullptr);
+  instance_ = this;
+}
+
+ProcessState::~ProcessState() {
+  for (auto p : gpu_allocators_) {
+    delete p;
+  }
+  instance_ = nullptr;
+}
+
+string ProcessState::MemDesc::DebugString() {
+  return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, ", dma: ",
+                         gpu_registered, ", nic: ", nic_registered);
+}
+
+ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
+  if (FLAGS_record_mem_types) {
+    auto iter = mem_desc_map_.find(ptr);
+    if (iter != mem_desc_map_.end()) {
+      return iter->second;
+    }
+  }
+  return MemDesc();
+}
+
+void ProcessState::SetGPUCount(int c) {
+  CHECK(gpu_count_ == 0 || gpu_count_ == c)
+      << "Cannot call SetGPUCount with a non-zero value "
+      << "not equal to prior set value.";
+  gpu_count_ = c;
+}
+
+int ProcessState::GPUCount() const { return gpu_count_; }
+
+Allocator* ProcessState::GetGPUAllocator(int gpu_id, size_t total_bytes) {
+#if GOOGLE_CUDA
+  mutex_lock lock(mu_);
+  gpu::Platform* gpu_platform = GPUMachineManager();
+
+  // Verify that gpu_id is legitimate.
+  CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount())
+      << "gpu_id is outside discovered device range";
+
+  if (gpu_id >= static_cast<int64>(gpu_allocators_.size())) {
+    gpu_allocators_.resize(gpu_id + 1);
+    if (FLAGS_record_mem_types) gpu_al_.resize(gpu_id + 1);
+  }
+
+  if (gpu_allocators_[gpu_id] == nullptr) {
+    VisitableAllocator* gpu_allocator;
+
+    if (FLAGS_brain_gpu_use_bfc_allocator) {
+      gpu_allocator = new GPUBFCAllocator(gpu_id, total_bytes);
+    } else {
+      gpu_allocator = new GPURegionAllocator(gpu_id, total_bytes);
+    }
+
+    if (FLAGS_brain_gpu_region_allocator_debug) {
+      gpu_allocator = new GPUDebugAllocator(gpu_allocator, gpu_id);
+    }
+    if (FLAGS_brain_gpu_region_allocator_reset_to_nan) {
+      gpu_allocator = new GPUNanResetAllocator(gpu_allocator, gpu_id);
+    }
+
+    gpu_allocators_[gpu_id] = gpu_allocator;
+
+    // If there are any pending AllocVisitors for this bus, add
+    // them now.
+    gpu::StreamExecutor* se =
+        gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+    int bus_id = se->GetDeviceDescription().numa_node();
+    if (bus_id < static_cast<int64>(gpu_visitors_.size())) {
+      for (auto v : gpu_visitors_[bus_id]) {
+        gpu_allocators_[gpu_id]->AddAllocVisitor(v);
+      }
+    }
+    if (FLAGS_record_mem_types) {
+      MemDesc md;
+      md.loc = MemDesc::GPU;
+      md.dev_index = gpu_id;
+      md.gpu_registered = false;
+      md.nic_registered = true;
+      if (static_cast<int64>(gpu_al_.size()) <= gpu_id)
+        gpu_al_.resize(gpu_id + 1);
+      gpu_al_[gpu_id] = new internal::RecordingAllocator(
+          &mem_desc_map_, gpu_allocators_[gpu_id], md, &mu_);
+    }
+  }
+  if (FLAGS_record_mem_types) return gpu_al_[gpu_id];
+  return gpu_allocators_[gpu_id];
+#else
+  LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda.";
+  return nullptr;
+#endif  // GOOGLE_CUDA
+}
+
+Allocator* ProcessState::GetCPUAllocator(int numa_node) {
+  // Although we're temporarily ignoring numa_node, check for legality.
+  CHECK_GE(numa_node, 0);
+  // TODO(tucker): actually maintain separate CPUAllocators for
+  // different numa_nodes. For now, just one.
+  numa_node = 0;
+  mutex_lock lock(mu_);
+  while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
+    cpu_allocators_.push_back(new PoolAllocator(
+        100 /*pool_size_limit*/, true /*auto_resize*/, new BasicCPUAllocator(),
+        new NoopRounder, "cpu_pool"));
+  }
+  return cpu_allocators_[0];
+}
+
+Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
+  if (gpu_count_ == 0 || !FLAGS_brain_mem_reg_cuda_dma) {
+    return GetCPUAllocator(numa_node);
+  }
+  // Although we're temporarily ignoring numa_node, check for legality.
+  CHECK_GE(numa_node, 0);
+  // TODO(tucker): actually maintain separate CPUAllocators for
+  // different numa_nodes. For now, just one.
+  numa_node = 0;
+  mutex_lock lock(mu_);
+  while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) {
+    // CUDAHost alloc the same across all gpus, so just get the
+    // executor for the first device.
+    gpu::Platform* gpu_platform = GPUMachineManager();
+    gpu::StreamExecutor* se = gpu_platform->ExecutorForDevice(0).ValueOrDie();
+    CHECK(se);
+    cuda_host_allocators_.push_back(new PoolAllocator(
+        100 /*pool_size_limit*/, true /*auto_resize*/,
+        new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host"));
+    if (FLAGS_record_mem_types) {
+      MemDesc md;
+      md.loc = MemDesc::CPU;
+      md.dev_index = 0;
+      md.gpu_registered = true;
+      md.nic_registered = false;
+      cuda_al_.push_back(new internal::RecordingAllocator(
+          &mem_desc_map_, cuda_host_allocators_.back(), md, &mu_));
+    }
+  }
+  if (FLAGS_record_mem_types) return cuda_al_[0];
+  return cuda_host_allocators_[0];
+}
+
+void ProcessState::AddGPUAllocVisitor(int bus_id, AllocVisitor visitor) {
+#if GOOGLE_CUDA
+  mutex_lock lock(mu_);
+  gpu::Platform* gpu_platform = GPUMachineManager();
+  for (int gpu_id = 0; gpu_id < static_cast<int64>(gpu_allocators_.size());
+       ++gpu_id) {
+    gpu::StreamExecutor* se =
+        gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+    if (gpu_allocators_[gpu_id] &&
+        se->GetDeviceDescription().numa_node() == bus_id) {
+      gpu_allocators_[gpu_id]->AddAllocVisitor(visitor);
+    }
+  }
+  while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
+    gpu_visitors_.push_back(std::vector<AllocVisitor>());
+  }
+  gpu_visitors_[bus_id].push_back(visitor);
+#endif  // GOOGLE_CUDA
+}
+
+}  // namespace tensorflow
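To make the allocator surface introduced above concrete, here is a hedged usage sketch (our illustration, not code from this commit) of how other runtime code would obtain memory through the `ProcessState` singleton. The function name `AllocatorUsageSketch` and the alignment, size, and 1 GiB GPU cap literals are illustrative assumptions.

```cpp
// Hedged usage sketch, not part of this commit. Exercises the
// ProcessState API as declared by this file; literals are illustrative.
#include "tensorflow/core/common_runtime/gpu/process_state.h"
#include "tensorflow/core/framework/allocator.h"

namespace tensorflow {

void AllocatorUsageSketch() {  // hypothetical helper, ours
  ProcessState* ps = ProcessState::singleton();

  // Pooled plain host memory. numa_node is currently collapsed to 0
  // (see the TODO(tucker) in the diff).
  Allocator* cpu = ps->GetCPUAllocator(0 /*numa_node*/);

  // Pinned host memory registered with the CUDA driver for fast DMA;
  // falls back to the plain CPU allocator when no GPUs are present or
  // --brain_mem_reg_cuda_dma is false.
  Allocator* cuda_host = ps->GetCUDAHostAllocator(0 /*numa_node*/);

  // Device memory for GPU 0. The byte cap (1 GiB here) is fixed by the
  // first call; this path LOG(FATAL)s unless built with GOOGLE_CUDA.
  Allocator* gpu = ps->GetGPUAllocator(0 /*gpu_id*/, 1ULL << 30);

  // Raw allocate/deallocate through the common Allocator interface.
  void* buf = cpu->AllocateRaw(32 /*alignment*/, 4096 /*num_bytes*/);
  cpu->DeallocateRaw(buf);

  (void)cuda_host;  // allocators are owned by ProcessState; never delete them
  (void)gpu;
}

}  // namespace tensorflow
```

Note the design choice visible in the diff: allocators are created lazily on first request, cached in per-device vectors, optionally wrapped in debug or recording adapters behind flags, and owned by `ProcessState` for the life of the process.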