Diffstat (limited to 'tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc')
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc | 371
1 file changed, 371 insertions, 0 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc
new file mode 100644
index 0000000000..08ff55e221
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc
@@ -0,0 +1,371 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h"
+
+//#include "base/commandlineflags.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/lib/core/bits.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+
+#if defined(PLATFORM_GOOGLE)
+DEFINE_bool(brain_gpu_region_allocator_heap_check_on_destruction, true,
+            "If true, the CUDA gpu manager checks that all memory "
+            "allocated through the GPU memory pool implementation has been "
+ "freed.");
+
+DEFINE_int64(brain_gpu_region_allocator_region_size, 0,
+ "If > 0, sets the default chunk-size allocatable from GPU memory. "
+ "Else defaults to entire GPU memory.");
+
+#else
+bool FLAGS_brain_gpu_region_allocator_heap_check_on_destruction = true;
+tensorflow::int64 FLAGS_brain_gpu_region_allocator_region_size = 0;
+#endif
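+
+// Illustrative note (not part of the original change): in builds that use the
+// DEFINE_* flag machinery above, passing e.g.
+//   --brain_gpu_region_allocator_region_size=268435456
+// sets region_size_ to 256MB, so GPU memory is reserved in 256MB regions
+// rather than all at once; the open-source fallback globals above use the
+// compiled-in defaults instead.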
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+GPURegionAllocator::GPURegionAllocator(int device_id, size_t total_bytes)
+ : device_id_(device_id), total_bytes_(total_bytes) {
+ // Get a pointer to the stream_executor for this device
+ stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+
+ // Set the region size based on explicit user request, or based on
+ // total GPU capacity.
+ if (FLAGS_brain_gpu_region_allocator_region_size > 0) {
+ region_size_ = FLAGS_brain_gpu_region_allocator_region_size;
+ } else {
+ region_size_ = static_cast<size_t>(total_bytes_);
+ }
+
+ LOG(INFO) << "Setting region size to " << region_size_;
+}
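+
+// Illustrative note (not part of the original source): with the region-size
+// flag unset, region_size_ == total_bytes_, so the first region allocation in
+// ExpandPool below claims the entire total_bytes_ budget up front and all
+// chunks are carved out of that single region.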
+
+GPURegionAllocator::~GPURegionAllocator() {
+ if (FLAGS_brain_gpu_region_allocator_heap_check_on_destruction) {
+ CheckForMemoryLeaks();
+ }
+
+ gtl::STLDeleteValues(&chunk_map_);
+
+ for (auto r : regions_) {
+ gpu::DeviceMemoryBase gpu_ptr{r->ptr};
+ stream_exec_->Deallocate(&gpu_ptr);
+ delete r;
+ }
+}
+
+void* GPURegionAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
+ static const int64 kMaxMillisToWait = 10000; // 10 seconds
+ return retry_helper_.AllocateRaw(
+ [this](size_t a, size_t nb, bool v) {
+ return AllocateRawInternal(a, nb, v);
+ },
+ kMaxMillisToWait, alignment, num_bytes);
+}
+
+void* GPURegionAllocator::AllocateRawInternal(size_t alignment,
+ size_t num_bytes,
+ bool dump_log_on_failure) {
+ if (num_bytes == 0) {
+ LOG(ERROR) << "tried to allocate 0 bytes";
+ return nullptr;
+ }
+ size_t chunk_size = ChunkSize(num_bytes);
+
+ VLOG(2) << "chunk_size " << chunk_size << " from num_bytes "
+ << strings::HumanReadableNumBytes(num_bytes);
+ mutex_lock l(lock_);
+ Pool* pool = &pools_[chunk_size];
+ if (pool->num_free == 0) {
+ if (!ExpandPool(pool, chunk_size, num_bytes, dump_log_on_failure)) {
+ if (dump_log_on_failure) {
+ LOG(WARNING) << "Out of GPU memory, see memory state dump above";
+ }
+ return nullptr;
+ }
+ }
+ CHECK_LT(0, pool->num_free);
+ CHECK(pool->first);
+ CHECK(pool->last);
+ Chunk* c = pool->first;
+ CHECK(c);
+ CHECK(!c->in_use);
+
+ c->in_use = true;
+ // Move c to the back of the queue.
+ if (c->next != nullptr) {
+ pool->first = c->next;
+ pool->first->prev = nullptr;
+ c->next = nullptr;
+ }
+
+ if (pool->last != c) {
+ pool->last->next = c;
+ c->prev = pool->last;
+ pool->last = c;
+ }
+ pool->num_free--;
+ pool->cumulative_malloced++;
+
+ void* rv = c->ptr;
+ c->bytes_allocated = num_bytes;
+
+ VLOG(2) << "new ptr " << rv;
+ return rv;
+}
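+
+// Illustrative sketch (not part of the original source): in a pool of four
+// 256-byte chunks, three allocations leave the list as
+//   first -> [free] -> [in_use] -> [in_use] -> [in_use] <- last
+// Free chunks stay at the front and in-use chunks are moved to the back, so
+// the code above can always hand out pool->first whenever num_free > 0.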
+
+void GPURegionAllocator::DeallocateRaw(void* ptr) {
+ retry_helper_.DeallocateRaw([this](void* p) { DeallocateRawInternal(p); },
+ ptr);
+}
+
+void GPURegionAllocator::DeallocateRawInternal(void* ptr) {
+ VLOG(2) << "DeallocateRaw: " << ptr;
+ if (ptr == nullptr) {
+ LOG(ERROR) << "tried to deallocate nullptr";
+ return;
+ }
+
+ mutex_lock l(lock_);
+ ChunkMap::const_iterator iter = chunk_map_.find(ptr);
+ CHECK(iter != chunk_map_.end());
+
+ Chunk* c = iter->second;
+ VLOG(2) << "chunk of size " << c->size << " at " << c;
+
+ Pool* pool = &(pools_[c->size]);
+ // Move chunk to head of queue, and mark free.
+ DCHECK(c->in_use);
+ c->in_use = false;
+ if (c->prev) c->prev->next = c->next;
+ if (c->next) c->next->prev = c->prev;
+ if (pool->first == c) pool->first = c->next;
+ if (pool->last == c) pool->last = c->prev;
+ c->next = pool->first;
+ c->prev = nullptr;
+ if (c->next) c->next->prev = c;
+ pool->first = c;
+ if (pool->last == nullptr) pool->last = c;
+ pool->num_free++;
+ pool->cumulative_freed++;
+}
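+
+// Illustrative sketch (not part of the original source): freeing chunk B from
+//   first -> A(free) -> B(in_use) -> C(in_use) <- last
+// splices B out of the middle and reinserts it at the head:
+//   first -> B(free) -> A(free) -> C(in_use) <- last
+// restoring the free-chunks-at-the-front ordering that AllocateRawInternal
+// relies on.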
+
+bool GPURegionAllocator::ExpandPool(Pool* pool, size_t chunk_size,
+ size_t requested_size,
+ bool dump_log_on_failure) {
+ VLOG(1) << "ExpandPool of " << chunk_size << " from " << pool->num_chunks
+ << " current members";
+ DCHECK_NE(0, chunk_size);
+  // Grow an existing pool by its current chunk count (i.e. double it);
+  // seed an empty pool with enough chunks to cover 4096 bytes (at least
+  // one chunk for sizes above 4096).
+ int num_chunks = pool->num_chunks;
+ if (num_chunks == 0) {
+ if (chunk_size > 4096) {
+ num_chunks = 1;
+ } else {
+ num_chunks = 4096 / chunk_size;
+ }
+ }
+ // For larger chunks, limit the amount of expansion.
+ size_t aggregate_size = num_chunks * chunk_size;
+ if (aggregate_size > (1 << 20)) {
+ num_chunks = static_cast<int>(
+ std::max(static_cast<size_t>(1), (1 << 20) / chunk_size));
+ }
+ while (num_chunks > 0) {
+ Region* r = (regions_.empty() ? nullptr : regions_.back());
+ if (r == nullptr ||
+ (((r->ptr + r->size) - r->next) < static_cast<int64>(chunk_size))) {
+ // Current region is not large enough to accommodate another chunk.
+ while (r == nullptr || (((r->ptr + r->size) - r->next) <
+ static_cast<int64>(chunk_size))) {
+ // Get another region.
+ size_t this_region_size = std::max(region_size_, chunk_size);
+
+ // Check if we would exceed our limit.
+ if (allocated_memory_ + this_region_size > total_bytes_) {
+ if (dump_log_on_failure) DumpMemoryLog();
+ return false;
+ }
+
+ // Perform the allocation, still checking that the allocator
+ // has not run out of memory.
+ gpu::DeviceMemory<char> gpu_mem =
+ stream_exec_->AllocateArray<char>(this_region_size);
+ if (gpu_mem == nullptr) {
+ if (dump_log_on_failure) DumpMemoryLog();
+ return false;
+ }
+
+ // We never release memory once expanded.
+ allocated_memory_ += this_region_size;
+
+ Region* nr = new Region;
+ nr->ptr = static_cast<char*>(gpu_mem.opaque());
+
+ if (VLOG_IS_ON(2)) {
+ int64 free_bytes;
+ int64 total_bytes;
+ if (stream_exec_->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
+ VLOG(2) << "free " << free_bytes << " total " << total_bytes;
+ } else {
+ // Note: stream_exec call also logs internally on failure.
+ VLOG(2) << "could not retrieve memory usage";
+ }
+ }
+ VLOG(1) << "new Region of size " << this_region_size << " at "
+ << static_cast<void*>(nr->ptr) << " on device " << device_id_;
+ r = nr;
+ r->size = this_region_size;
+ r->next = r->ptr;
+ regions_.push_back(r);
+
+ for (auto visitor : region_visitors_) {
+ visitor(r->ptr, r->size);
+ }
+ }
+ } else {
+ // Allocate a new chunk and push on front of Pool.
+ Chunk* c = new Chunk;
+ c->ptr = r->next;
+ chunk_map_[c->ptr] = c;
+ c->size = chunk_size;
+ r->next += chunk_size;
+ c->next = pool->first;
+ if (c->next != nullptr) c->next->prev = c;
+ pool->first = c;
+ if (pool->last == nullptr) pool->last = c;
+ pool->num_chunks++;
+ pool->num_free++;
+ --num_chunks;
+ }
+ }
+
+ return true;
+}
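+
+// Worked example (illustrative, not part of the original source): for
+// chunk_size == 256 and an empty pool, num_chunks starts at 4096 / 256 == 16
+// (aggregate 4KB, below the 1MB cap). For chunk_size == 1MB and a pool that
+// already holds 4 chunks, num_chunks starts at 4 but the 1MB cap reduces it
+// to max(1, (1 << 20) / (1 << 20)) == 1, so the pool grows by a single chunk
+// per call.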
+
+void GPURegionAllocator::CheckForMemoryLeaks() {
+ std::vector<string> errors;
+ mutex_lock l(lock_); // could use reader lock
+ for (auto pool_map : pools_) {
+ const Pool& p = pool_map.second;
+ Chunk* curr_chunk = p.first;
+ while (curr_chunk != nullptr) {
+ if (curr_chunk->in_use) {
+ errors.push_back(
+ strings::StrCat("Unfreed chunk of size ", curr_chunk->size));
+ }
+ curr_chunk = curr_chunk->next;
+ }
+ }
+ if (!errors.empty()) {
+ LOG(FATAL) << "GPU Memory leaks:\n" << str_util::Join(errors, "\n");
+ }
+}
+
+// Since there's no merging of chunks once allocated, we want to
+// maximize their reusability (which argues for fewer, larger sizes),
+// while minimizing waste (which argues for tight-fitting sizes).
+//
+// The smallest unit of allocation is 256 bytes.
+// NOTE(tucker): akrizhevsky says that nvidia's memory manager always
+// aligns to 256 bytes, and doing so results in significant speedup.
+//
+// Up to 2^16 bytes we only allocate in powers of 2.
+//
+// Above that, we pick a max-waste which is the smallest power
+// of 2 >= 1/16 of the requested size, then round up to the nearest
+// multiple of max_waste.
+//
+// static
+size_t GPURegionAllocator::ChunkSize(size_t bytes) {
+ if (bytes <= 256) {
+ return 256;
+ } else if (bytes <= (1 << 16)) {
+ return 1uLL << Log2Ceiling64(bytes);
+ } else {
+ // 1/16th of requested size
+ size_t max_waste = 1uLL << (Log2Ceiling64(bytes) - 4);
+ return (bytes + max_waste) & (~(max_waste - 1));
+ }
+}
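+
+// Worked examples (illustrative, not part of the original source): a 300-byte
+// request rounds up to 512, the next power of 2. A 100000-byte request has
+// Log2Ceiling64(100000) == 17, so max_waste == 1 << 13 == 8192 and the chunk
+// size is (100000 + 8192) & ~8191 == 106496, i.e. 6496 bytes of waste.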
+
+void GPURegionAllocator::AddAllocVisitor(Visitor visitor) {
+ VLOG(1) << "AddVisitor";
+ mutex_lock l(lock_);
+ region_visitors_.push_back(visitor);
+ for (auto region : regions_) {
+ visitor(region->ptr, region->size);
+ }
+}
+
+void GPURegionAllocator::DumpMemoryLog() {
+ size_t region_bytes = 0;
+ for (auto r : regions_) {
+ region_bytes += r->size;
+ }
+ size_t chunk_bytes = 0;
+ std::vector<size_t> chunk_sizes;
+ for (auto i : pools_) {
+ chunk_sizes.push_back(i.first);
+ }
+ std::sort(chunk_sizes.begin(), chunk_sizes.end());
+ for (auto i : chunk_sizes) {
+ int32 chunks_in_use = 0;
+ const Pool& p = pools_[i];
+ chunk_bytes += i * p.num_chunks;
+
+ if (p.num_chunks > 0) {
+ // Iterate backwards (allocated chunks are last).
+ Chunk* curr_chunk = p.last;
+ while (curr_chunk != nullptr) {
+ if (curr_chunk->in_use) {
+ ++chunks_in_use;
+ }
+        // p.first->prev is nullptr, so the walk naturally terminates after
+        // visiting (and counting) p.first as well.
+        curr_chunk = curr_chunk->prev;
+ }
+ }
+
+ LOG(INFO) << "Chunk size: " << i << " ("
+ << strings::HumanReadableNumBytes(i) << ") Pool: " << p.ToString()
+ << "\nNumber of chunks: " << p.num_chunks
+ << ", in_use chunks: " << chunks_in_use;
+ }
+
+ LOG(INFO) << "Aggregate Region Memory: " << region_bytes << " ("
+ << strings::HumanReadableNumBytes(region_bytes) << ")";
+ LOG(INFO) << "Aggregate Chunk Memory: " << chunk_bytes << " ("
+ << strings::HumanReadableNumBytes(chunk_bytes) << ")";
+}
+
+bool GPURegionAllocator::TracksAllocationSizes() { return true; }
+
+size_t GPURegionAllocator::RequestedSize(void* ptr) {
+ mutex_lock l(lock_);
+ auto it = chunk_map_.find(ptr);
+ CHECK(it != chunk_map_.end())
+ << "Asked for requested size of pointer we never allocated: " << ptr;
+ auto c = it->second;
+ return c->bytes_allocated;
+}
+
+size_t GPURegionAllocator::AllocatedSize(void* ptr) {
+ mutex_lock l(lock_);
+ auto it = chunk_map_.find(ptr);
+ CHECK(it != chunk_map_.end())
+ << "Asked for allocated size of pointer we never allocated: " << ptr;
+ auto c = it->second;
+ return c->size;
+}
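+
+// Usage note (illustrative, not part of the original source): after
+//   void* p = allocator->AllocateRaw(32, 300);
+// RequestedSize(p) returns 300 (the caller's request) while AllocatedSize(p)
+// returns 512 (the chunk size chosen by ChunkSize for that request).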
+
+} // namespace tensorflow