about summary refs log tree commit diff homepage
path: root/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc')
-rw-r--r--tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc397
1 files changed, 397 insertions, 0 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
new file mode 100644
index 0000000000..3df833594f
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
@@ -0,0 +1,397 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
+
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/lib/core/bits.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+// Constructor: reserves `total_memory` bytes of GPU device memory up
+// front in a single StreamExecutor allocation, then manages that region
+// with a best-fit-with-coalescing (BFC) strategy.  Free chunks are kept
+// in bins keyed by a power-of-two "max chunk size"; a chunk lives in the
+// smallest bin whose key is >= its size (see ReassignChunkToBin).
+GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory)
+    : device_id_(device_id) {
+  // Get a pointer to the stream_executor for this device
+  stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+
+  // Allocate the requested amount of memory.
+  gpu_memory_size_ = total_memory;
+
+  LOG(INFO) << "Allocating " << strings::HumanReadableNumBytes(gpu_memory_size_)
+            << " bytes.";
+  gpu::DeviceMemory<char> gpu_mem =
+      stream_exec_->AllocateArray<char>(gpu_memory_size_);
+
+  // Fail hard if the one-time region allocation did not succeed; the
+  // allocator cannot operate without its backing region.
+  QCHECK(gpu_mem != nullptr)
+      << " Could not allocate GPU device memory for device " << device_id
+      << ". Tried to allocate "
+      << strings::HumanReadableNumBytes(gpu_memory_size_);
+  base_ptr_ = gpu_mem.opaque();
+  LOG(INFO) << "GPU " << device_id << " memory begins at " << base_ptr_
+            << " extends to "
+            << static_cast<void*>(
+                   (static_cast<char*>(base_ptr_) + gpu_memory_size_));
+
+  // Create a bunch of bins of various good sizes.
+
+  // Covers allocations of exactly 256 bytes (the minimum size).
+  bins_.insert(std::make_pair(256, new Bin(256)));
+
+  // We create bins to fit all possible ranges that cover the
+  // gpu_memory_size_ starting from allocations up to 1024 bytes to
+  // allocations up to (and including) the memory limit.  The `* 2`
+  // bound guarantees the last bin's key is >= gpu_memory_size_, so the
+  // initial whole-region chunk below always has a bin to land in.
+  for (size_t bin_size = 1024; bin_size < gpu_memory_size_ * 2; bin_size *= 2) {
+    LOG(INFO) << "Creating bin of max chunk size "
+              << strings::HumanReadableNumBytes(bin_size);
+    bins_.insert(std::make_pair(bin_size, new Bin(bin_size)));
+  }
+
+  // Create one large chunk for the whole memory space that will
+  // be chunked later.  prev/next maintain an address-ordered doubly
+  // linked list used for splitting and coalescing.
+  GPUBFCAllocator::Chunk* c = new GPUBFCAllocator::Chunk();
+  c->ptr = gpu_mem.opaque();
+  c->size = gpu_memory_size_;
+  c->in_use = false;
+  c->prev = nullptr;
+  c->next = nullptr;
+
+  ptr_to_chunk_map_.insert(std::make_pair(c->ptr, c));
+
+  // Insert the chunk into the right bin.
+  ReassignChunkToBin(c);
+}
+
+// Destructor: returns the backing region to the device and frees all
+// host-side bookkeeping (bins and chunk metadata).
+GPUBFCAllocator::~GPUBFCAllocator() {
+  // Return memory back.
+  if (base_ptr_) {
+    gpu::DeviceMemoryBase gpu_ptr{base_ptr_};
+    stream_exec_->Deallocate(&gpu_ptr);
+  }
+
+  // Delete the Chunk structures still registered in the map.  Chunks
+  // are only deleted by RemoveChunkFromBin when they are merged away,
+  // so without this any remaining chunks (at minimum the initial
+  // whole-region chunk, or every live chunk after fragmentation) would
+  // leak host memory.
+  gtl::STLDeleteValues(&ptr_to_chunk_map_);
+
+  gtl::STLDeleteValues(&bins_);
+}
+
+// Public allocation entry point.  Delegates to AllocateRawInternal via
+// the retry helper, which waits (bounded) for other threads to free
+// memory before reporting failure.
+void* GPUBFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) {
+  static const int64 kMaxMillisToWait = 10000;  // 10 seconds
+  // Adapter invoked by the retry helper; 'v' controls whether a failed
+  // attempt dumps the memory log.
+  auto allocate_fn = [this](size_t a, size_t nb, bool v) {
+    return AllocateRawInternal(a, nb, v);
+  };
+  return retry_helper_.AllocateRaw(allocate_fn, kMaxMillisToWait,
+                                   unused_alignment, num_bytes);
+}
+
+// Core allocation: rounds the request up to a 256-byte multiple, then
+// scans bins (smallest adequate bin first) for a free chunk that fits.
+// Oversized chunks are split so the remainder stays available.  Returns
+// nullptr on failure; if dump_log_on_failure is set, logs a detailed
+// memory state dump first (used on the final retry attempt).
+void* GPUBFCAllocator::AllocateRawInternal(size_t unused_alignment,
+                                           size_t num_bytes,
+                                           bool dump_log_on_failure) {
+  if (num_bytes == 0) {
+    LOG(ERROR) << "tried to allocate 0 bytes";
+    return nullptr;
+  }
+  // First, always allocate memory of at least 256 bytes, and always
+  // allocate multiples of 256 bytes so all memory addresses are
+  // nicely byte aligned.
+  size_t rounded_bytes = (256 * ((num_bytes + 255) / 256));
+  DCHECK_EQ(0, rounded_bytes % 256);
+
+  // The BFC allocator tries to find the best fit first.
+  //
+  // First identify the first bin that could satisfy rounded_bytes.
+  // (bins_ itself is immutable after construction, so this read is
+  // safe before taking lock_.)
+  auto it = bins_.lower_bound(rounded_bytes);
+  if (it == bins_.end()) {
+    LOG(ERROR) << " Asked for " << rounded_bytes << " but largest bin was "
+               << bins_.rbegin()->first;
+    return nullptr;
+  }
+
+  mutex_lock l(lock_);
+  for (; it != bins_.end(); ++it) {
+    // Start searching from the first bin for the smallest chunk that fits
+    // rounded_bytes.
+    Bin* b = it->second;
+    for (GPUBFCAllocator::Chunk* chunk : b->chunks) {
+      // An exact-size chunk is a perfect fit, so the comparison must be
+      // '>='.  (With '>' an exactly-fitting free chunk was skipped,
+      // wasting memory and causing avoidable allocation failures.)
+      if (!chunk->in_use && chunk->size >= rounded_bytes) {
+        // We found an existing chunk that fits us that wasn't in use.
+        chunk->in_use = true;
+
+        // If we can break the size of the chunk into two reasonably
+        // large pieces, do so.
+        //
+        // TODO(vrv): What should be the criteria when deciding when
+        // to split?
+        if (chunk->size >= rounded_bytes * 2) {
+          SplitChunk(chunk, rounded_bytes);
+        }
+
+        // The requested size of the returned chunk is what the user
+        // has allocated.
+        chunk->requested_size = num_bytes;
+
+        VLOG(4) << "Returning: " << chunk->ptr;
+        return chunk->ptr;
+      }
+    }
+  }
+
+  // We searched all bins for an existing free chunk to use and
+  // couldn't find one. This means we must have run out of memory,
+  // Dump the memory log for analysis.
+  if (dump_log_on_failure) {
+    DumpMemoryLog(rounded_bytes);
+    LOG(WARNING) << "Ran out of memory trying to allocate "
+                 << strings::HumanReadableNumBytes(num_bytes)
+                 << ". See logs for memory state";
+  }
+  return nullptr;
+}
+
+// Splits chunk 'c' in two: 'c' keeps its first num_bytes; a new chunk
+// covers the remaining c->size - num_bytes bytes and is marked free.
+// The new chunk is registered in ptr_to_chunk_map_, spliced into the
+// address-ordered doubly-linked chunk list right after 'c', and both
+// chunks are re-binned for their new sizes.  Caller must hold lock_
+// and guarantee c->size > num_bytes.
+void GPUBFCAllocator::SplitChunk(GPUBFCAllocator::Chunk* c, size_t num_bytes) {
+  // Create a new chunk starting num_bytes after c
+  GPUBFCAllocator::Chunk* new_chunk = new GPUBFCAllocator::Chunk();
+  new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
+  VLOG(6) << "Adding to chunk map: " << new_chunk->ptr;
+  ptr_to_chunk_map_.insert(std::make_pair(new_chunk->ptr, new_chunk));
+
+  // Set the new sizes of the chunks.
+  new_chunk->size = c->size - num_bytes;
+  c->size = num_bytes;
+
+  // The new chunk is not in use.
+  new_chunk->in_use = false;
+
+  // Maintain the pointers.
+  // c <-> c_neighbor becomes
+  // c <-> new_chunk <-> c_neighbor
+  GPUBFCAllocator::Chunk* c_neighbor = c->next;
+  new_chunk->prev = c;
+  new_chunk->next = c_neighbor;
+  c->next = new_chunk;
+  if (c_neighbor) {
+    c_neighbor->prev = new_chunk;
+  }
+
+  // Maintain the bins.  Both chunks changed size (or were just
+  // created), so each may belong to a different bin now.
+  ReassignChunkToBin(new_chunk);
+  ReassignChunkToBin(c);
+}
+
+// Public deallocation entry point.  Routed through the retry helper so
+// that allocation attempts waiting for memory can be notified once the
+// chunk is returned.
+void GPUBFCAllocator::DeallocateRaw(void* ptr) {
+  auto dealloc_fn = [this](void* p) { DeallocateRawInternal(p); };
+  retry_helper_.DeallocateRaw(dealloc_fn, ptr);
+}
+
+// Marks the chunk backing 'ptr' as free and attempts to coalesce it
+// with free neighbors.  CHECK-fails if 'ptr' was not produced by this
+// allocator.
+void GPUBFCAllocator::DeallocateRawInternal(void* ptr) {
+  if (ptr == nullptr) {
+    LOG(ERROR) << "tried to deallocate nullptr";
+    return;
+  }
+  mutex_lock l(lock_);
+
+  // Look up the bookkeeping record for this pointer.
+  auto iter = ptr_to_chunk_map_.find(ptr);
+  CHECK(iter != ptr_to_chunk_map_.end())
+      << "Asked to deallocate a pointer we never allocated: " << ptr;
+
+  GPUBFCAllocator::Chunk* freed_chunk = iter->second;
+  VLOG(6) << "Chunk at " << freed_chunk->ptr << " no longer in use";
+
+  // The chunk is now free; try to merge it with adjacent free chunks.
+  freed_chunk->in_use = false;
+  MaybeCoalesce(freed_chunk);
+}
+
+// Merges c1 and c2 when c1->next is c2 and c2->prev is c1.
+// We merge c2 into c1: c1 absorbs c2's bytes, c2 is unlinked from the
+// chunk list, removed from its bin / the chunk map, and deleted.  Both
+// chunks must be free, and caller must hold lock_.  NOTE: c1 is left
+// in its ORIGINAL bin despite its larger size; the caller is expected
+// to call ReassignChunkToBin(c1) afterwards (see MaybeCoalesce).
+void GPUBFCAllocator::Merge(GPUBFCAllocator::Chunk* c1,
+                            GPUBFCAllocator::Chunk* c2) {
+  // We can only merge chunks that are not in use.
+  DCHECK(!c1->in_use && !c2->in_use);
+
+  // c1's prev doesn't change, still points to the same ptr, and is
+  // still not in use.
+
+  // Fix up neighbor pointers
+  //
+  // c1 <-> c2 <-> c3 should become
+  // c1 <-> c3
+  GPUBFCAllocator::Chunk* c3 = c2->next;
+  c1->next = c3;
+  CHECK(c2->prev == c1);
+  if (c3 != nullptr) {
+    c3->prev = c1;
+  }
+
+  // Set the new size
+  c1->size += c2->size;
+
+  // Delete c2 and cleanup all state (bin membership, chunk map entry,
+  // and the Chunk object itself).
+  RemoveChunkFromBin(c2);
+}
+
+// Moves 'c' into the bin whose key is the smallest bin size >= c->size
+// (the bin a chunk of that size belongs in).  No-op if the chunk is
+// already in the right bin.  Caller must hold lock_ (or be the
+// constructor, which runs single-threaded).
+void GPUBFCAllocator::ReassignChunkToBin(GPUBFCAllocator::Chunk* c) {
+  auto it = bins_.lower_bound(c->size);
+  CHECK(it != bins_.end()) << " Tried to reassign to non-existent bin for size "
+                           << c->size;
+
+  Bin* new_bin = it->second;
+
+  // If the bin has not changed, do nothing.
+  Bin* old_bin = c->bin;
+  if (old_bin != nullptr && new_bin == old_bin) {
+    return;
+  }
+
+  // The bin has changed. Add the chunk to the new bin and remove
+  // the chunk from the old bin.
+  new_bin->chunks.insert(c);
+  c->bin = new_bin;
+
+  if (old_bin == nullptr) {
+    return;
+  }
+
+  // Remove chunk from old bin.  This is a linear scan by pointer
+  // identity rather than old_bin->chunks.find(c) — presumably because
+  // the bin's ordering depends on the chunk's size, which the caller
+  // has already mutated, making a comparator-based lookup unreliable.
+  // (TODO: confirm against Bin's comparator in the header.)
+  for (auto it = old_bin->chunks.begin(); it != old_bin->chunks.end(); ++it) {
+    if (*it == c) {
+      old_bin->chunks.erase(it);
+      return;
+    }
+  }
+  CHECK(false) << "Could not find chunk in old bin";
+}
+
+// Fully retires chunk 'c': erases it from its bin, drops its entry in
+// ptr_to_chunk_map_, and deletes the Chunk object.  After this call 'c'
+// is a dangling pointer.  Caller must hold lock_.  The bin scan matches
+// on c->ptr (a chunk's address uniquely identifies it) — a linear scan
+// for the same reason as in ReassignChunkToBin: the chunk's size may no
+// longer agree with the bin's ordering.
+void GPUBFCAllocator::RemoveChunkFromBin(GPUBFCAllocator::Chunk* c) {
+  Bin* b = c->bin;
+  for (auto it = b->chunks.begin(); it != b->chunks.end(); ++it) {
+    Chunk* other_c = *it;
+    if (other_c->ptr == c->ptr) {
+      b->chunks.erase(it);
+      VLOG(4) << "Removing: " << c->ptr;
+      ptr_to_chunk_map_.erase(c->ptr);
+      delete c;
+      return;
+    }
+  }
+
+  CHECK(false) << "Could not find chunk in bin";
+}
+
+// Coalesces the newly-freed chunk 'c' with its free neighbors in the
+// address-ordered chunk list, then re-bins whichever chunk survives.
+// Caller must hold lock_ and must have already cleared c->in_use.
+void GPUBFCAllocator::MaybeCoalesce(GPUBFCAllocator::Chunk* c) {
+  // This chunk is no longer in-use, consider coalescing the chunk
+  // with adjacent chunks.
+  Chunk* chunk_to_reassign = nullptr;
+
+  // If the next chunk is free, coalesce the two, if the result would
+  // fit in an existing bin.
+  if (c->next && !c->next->in_use) {
+    VLOG(8) << "Chunk at " << c->next->ptr << " merging with c " << c->ptr;
+
+    chunk_to_reassign = c;
+
+    // Deletes c->next; 'c' itself remains valid and has absorbed its
+    // neighbor's bytes.
+    Merge(c, c->next);
+  }
+
+  // If the previous chunk is free, coalesce the two
+  if (c->prev && !c->prev->in_use) {
+    VLOG(8) << "Chunk at " << c->ptr << " merging into c->prev "
+            << c->prev->ptr;
+
+    chunk_to_reassign = c->prev;
+
+    // Deletes c, so 'c' must not be touched after this point; c->prev
+    // is the surviving chunk (possibly holding all three regions).
+    Merge(c->prev, c);
+  }
+
+  // Reassign the final merged chunk into the right bin.  Merge()
+  // deliberately skips re-binning so a doubly-merged chunk is only
+  // re-binned once, here.
+  if (chunk_to_reassign) {
+    ReassignChunkToBin(chunk_to_reassign);
+  }
+}
+
+// Registers a visitor for memory regions and immediately invokes it on
+// the allocator's single region (established at construction).
+void GPUBFCAllocator::AddAllocVisitor(Visitor visitor) {
+  VLOG(1) << "AddVisitor";
+  mutex_lock guard(lock_);
+  region_visitors_.push_back(visitor);
+  // There is exactly one backing region, so visit it right away.
+  visitor(base_ptr_, gpu_memory_size_);
+}
+
+// This allocator keeps per-chunk requested and allocated sizes, so the
+// size-query interface (RequestedSize / AllocatedSize) is supported.
+bool GPUBFCAllocator::TracksAllocationSizes() {
+  return true;
+}
+
+// Returns the number of bytes the client originally asked for when
+// allocating 'ptr' (before rounding).  CHECK-fails on unknown pointers.
+size_t GPUBFCAllocator::RequestedSize(void* ptr) {
+  mutex_lock l(lock_);
+  auto iter = ptr_to_chunk_map_.find(ptr);
+  CHECK(iter != ptr_to_chunk_map_.end())
+      << "Asked for requested size of pointer we never allocated: " << ptr;
+  return iter->second->requested_size;
+}
+
+// Returns the actual number of bytes backing 'ptr' (the chunk size,
+// which may exceed the requested size due to rounding and splitting
+// policy).  CHECK-fails on unknown pointers.
+size_t GPUBFCAllocator::AllocatedSize(void* ptr) {
+  mutex_lock l(lock_);
+  auto iter = ptr_to_chunk_map_.find(ptr);
+  CHECK(iter != ptr_to_chunk_map_.end())
+      << "Asked for allocated size of pointer we never allocated: " << ptr;
+  return iter->second->size;
+}
+
+// Logs a per-bin summary of chunk counts and byte totals, then dumps
+// the chunk state of the bin that a num_bytes request would map to —
+// useful for diagnosing fragmentation after an allocation failure.
+// Called from AllocateRawInternal with lock_ already held.
+void GPUBFCAllocator::DumpMemoryLog(size_t num_bytes) {
+  // For each bin: tally up the total number of chunks and bytes.
+  // Iterate by const reference to avoid copying each map entry.
+  for (const auto& bit : bins_) {
+    Bin* b = bit.second;
+
+    size_t total_bytes_in_use = 0;
+    size_t total_bytes_in_bin = 0;
+    size_t total_requested_bytes_in_use = 0;
+    size_t total_requested_bytes_in_bin = 0;
+    size_t total_chunks_in_use = 0;
+    size_t total_chunks_in_bin = 0;
+    for (Chunk* c : b->chunks) {
+      total_bytes_in_bin += c->size;
+      total_requested_bytes_in_bin += c->requested_size;
+      ++total_chunks_in_bin;
+      if (c->in_use) {
+        total_bytes_in_use += c->size;
+        total_requested_bytes_in_use += c->requested_size;
+        ++total_chunks_in_use;
+      }
+    }
+
+    LOG(INFO) << "Bin (" << b->bin_size
+              << "): \tTotal Chunks: " << total_chunks_in_bin
+              << ", Chunks in use: " << total_chunks_in_use << " "
+              << strings::HumanReadableNumBytes(total_bytes_in_bin)
+              << " allocated for chunks. "
+              << strings::HumanReadableNumBytes(total_requested_bytes_in_bin)
+              << " client-requested for chunks. "
+              << strings::HumanReadableNumBytes(total_bytes_in_use)
+              << " in use in bin. "
+              << strings::HumanReadableNumBytes(total_requested_bytes_in_use)
+              << " client-requested in use in bin.";
+  }
+
+  // Find the bin that we would have liked to allocate in, so we
+  // can get some further analysis about fragmentation.
+  auto it = bins_.lower_bound(num_bytes);
+  if (it != bins_.end()) {
+    Bin* b = it->second;
+
+    LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes)
+              << " was " << strings::HumanReadableNumBytes(b->bin_size)
+              << ", Chunk State: ";
+
+    for (Chunk* c : b->chunks) {
+      LOG(INFO) << c->DebugString(true);
+    }
+  }
+}
+
+} // namespace tensorflow