Diffstat (limited to 'tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc')
-rw-r--r-- | tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc | 371 |
1 file changed, 371 insertions, 0 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc
new file mode 100644
index 0000000000..08ff55e221
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc
@@ -0,0 +1,371 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h"
+
+//#include "base/commandlineflags.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/lib/core/bits.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+
+#if defined(PLATFORM_GOOGLE)
+DEFINE_bool(brain_gpu_region_allocator_heap_check_on_destruction, true,
+            "If true, the CUDA gpu manager checks that all allocated "
+            "memory through the GPU memory pool implementation has been "
+            "freed.");
+
+DEFINE_int64(brain_gpu_region_allocator_region_size, 0,
+             "If > 0, sets the default chunk-size allocatable from GPU memory. "
+             "Else defaults to entire GPU memory.");
+
+#else
+bool FLAGS_brain_gpu_region_allocator_heap_check_on_destruction = true;
+tensorflow::int64 FLAGS_brain_gpu_region_allocator_region_size = 0;
+#endif
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+GPURegionAllocator::GPURegionAllocator(int device_id, size_t total_bytes)
+    : device_id_(device_id), total_bytes_(total_bytes) {
+  // Get a pointer to the stream_executor for this device
+  stream_exec_ =
+      GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+
+  // Set the region size based on explicit user request, or based on
+  // total GPU capacity.
+  if (FLAGS_brain_gpu_region_allocator_region_size > 0) {
+    region_size_ = FLAGS_brain_gpu_region_allocator_region_size;
+  } else {
+    region_size_ = static_cast<size_t>(total_bytes_);
+  }
+
+  LOG(INFO) << "Setting region size to " << region_size_;
+}
+
+GPURegionAllocator::~GPURegionAllocator() {
+  if (FLAGS_brain_gpu_region_allocator_heap_check_on_destruction) {
+    CheckForMemoryLeaks();
+  }
+
+  gtl::STLDeleteValues(&chunk_map_);
+
+  for (auto r : regions_) {
+    gpu::DeviceMemoryBase gpu_ptr{r->ptr};
+    stream_exec_->Deallocate(&gpu_ptr);
+    delete r;
+  }
+}
+
+void* GPURegionAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
+  static const int64 kMaxMillisToWait = 10000;  // 10 seconds
+  return retry_helper_.AllocateRaw(
+      [this](size_t a, size_t nb, bool v) {
+        return AllocateRawInternal(a, nb, v);
+      },
+      kMaxMillisToWait, alignment, num_bytes);
+}
+
+void* GPURegionAllocator::AllocateRawInternal(size_t alignment,
+                                              size_t num_bytes,
+                                              bool dump_log_on_failure) {
+  if (num_bytes == 0) {
+    LOG(ERROR) << "tried to allocate 0 bytes";
+    return nullptr;
+  }
+  size_t chunk_size = ChunkSize(num_bytes);
+
+  VLOG(2) << "chunk_size " << chunk_size << " from num_bytes "
+          << strings::HumanReadableNumBytes(num_bytes);
+  mutex_lock l(lock_);
+  Pool* pool = &pools_[chunk_size];
+  if (pool->num_free == 0) {
+    if (!ExpandPool(pool, chunk_size, num_bytes, dump_log_on_failure)) {
+      if (dump_log_on_failure) {
+        LOG(WARNING) << "Out of GPU memory, see memory state dump above";
+      }
+      return nullptr;
+    }
+  }
+  CHECK_LT(0, pool->num_free);
+  CHECK(pool->first);
+  CHECK(pool->last);
+  Chunk* c = pool->first;
+  CHECK(c);
+  CHECK(!c->in_use);
+
+  c->in_use = true;
+  // Move c to the back of the queue.
+  if (c->next != nullptr) {
+    pool->first = c->next;
+    pool->first->prev = nullptr;
+    c->next = nullptr;
+  }
+
+  if (pool->last != c) {
+    pool->last->next = c;
+    c->prev = pool->last;
+    pool->last = c;
+  }
+  pool->num_free--;
+  pool->cumulative_malloced++;
+
+  void* rv = c->ptr;
+  c->bytes_allocated = num_bytes;
+
+  VLOG(2) << "new ptr " << rv;
+  return rv;
+}
+
+void GPURegionAllocator::DeallocateRaw(void* ptr) {
+  retry_helper_.DeallocateRaw([this](void* p) { DeallocateRawInternal(p); },
+                              ptr);
+}
+
+void GPURegionAllocator::DeallocateRawInternal(void* ptr) {
+  VLOG(2) << "DeallocateRaw: " << ptr;
+  if (ptr == nullptr) {
+    LOG(ERROR) << "tried to deallocate nullptr";
+    return;
+  }
+
+  mutex_lock l(lock_);
+  ChunkMap::const_iterator iter = chunk_map_.find(ptr);
+  CHECK(iter != chunk_map_.end());
+
+  Chunk* c = iter->second;
+  VLOG(2) << "chunk of size " << c->size << " at " << c;
+
+  Pool* pool = &(pools_[c->size]);
+  // Move chunk to head of queue, and mark free.
+  DCHECK(c->in_use);
+  c->in_use = false;
+  if (c->prev) c->prev->next = c->next;
+  if (c->next) c->next->prev = c->prev;
+  if (pool->first == c) pool->first = c->next;
+  if (pool->last == c) pool->last = c->prev;
+  c->next = pool->first;
+  c->prev = nullptr;
+  if (c->next) c->next->prev = c;
+  pool->first = c;
+  if (pool->last == nullptr) pool->last = c;
+  pool->num_free++;
+  pool->cumulative_freed++;
+}
+
+bool GPURegionAllocator::ExpandPool(Pool* pool, size_t chunk_size,
+                                    size_t requested_size,
+                                    bool dump_log_on_failure) {
+  VLOG(1) << "ExpandPool of " << chunk_size << " from " << pool->num_chunks
+          << " current members";
+  DCHECK_NE(0, chunk_size);
+  // Double the pool size, seeding an empty pool with ~4KB worth of
+  // chunks (one chunk if chunk_size > 4096); the expansion is capped
+  // below to at most ~1MB worth of chunks.
+  int num_chunks = pool->num_chunks;
+  if (num_chunks == 0) {
+    if (chunk_size > 4096) {
+      num_chunks = 1;
+    } else {
+      num_chunks = 4096 / chunk_size;
+    }
+  }
+  // For larger chunks, limit the amount of expansion.
+  size_t aggregate_size = num_chunks * chunk_size;
+  if (aggregate_size > (1 << 20)) {
+    num_chunks = static_cast<int>(
+        std::max(static_cast<size_t>(1), (1 << 20) / chunk_size));
+  }
+  while (num_chunks > 0) {
+    Region* r = (regions_.empty() ? nullptr : regions_.back());
+    if (r == nullptr ||
+        (((r->ptr + r->size) - r->next) < static_cast<int64>(chunk_size))) {
+      // Current region is not large enough to accommodate another chunk.
+      while (r == nullptr || (((r->ptr + r->size) - r->next) <
+                              static_cast<int64>(chunk_size))) {
+        // Get another region.
+        size_t this_region_size = std::max(region_size_, chunk_size);
+
+        // Check if we would exceed our limit.
+        if (allocated_memory_ + this_region_size > total_bytes_) {
+          if (dump_log_on_failure) DumpMemoryLog();
+          return false;
+        }
+
+        // Perform the allocation, still checking that the allocator
+        // has not run out of memory.
+        gpu::DeviceMemory<char> gpu_mem =
+            stream_exec_->AllocateArray<char>(this_region_size);
+        if (gpu_mem == nullptr) {
+          if (dump_log_on_failure) DumpMemoryLog();
+          return false;
+        }
+
+        // We never release memory once expanded.
+        allocated_memory_ += this_region_size;
+
+        Region* nr = new Region;
+        nr->ptr = static_cast<char*>(gpu_mem.opaque());
+
+        if (VLOG_IS_ON(2)) {
+          int64 free_bytes;
+          int64 total_bytes;
+          if (stream_exec_->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
+            VLOG(2) << "free " << free_bytes << " total " << total_bytes;
+          } else {
+            // Note: stream_exec call also logs internally on failure.
+            VLOG(2) << "could not retrieve memory usage";
+          }
+        }
+        VLOG(1) << "new Region of size " << this_region_size << " at "
+                << static_cast<void*>(nr->ptr) << " on device " << device_id_;
+        r = nr;
+        r->size = this_region_size;
+        r->next = r->ptr;
+        regions_.push_back(r);
+
+        for (auto visitor : region_visitors_) {
+          visitor(r->ptr, r->size);
+        }
+      }
+    } else {
+      // Allocate a new chunk and push on front of Pool.
+      Chunk* c = new Chunk;
+      c->ptr = r->next;
+      chunk_map_[c->ptr] = c;
+      c->size = chunk_size;
+      r->next += chunk_size;
+      c->next = pool->first;
+      if (c->next != nullptr) c->next->prev = c;
+      pool->first = c;
+      if (pool->last == nullptr) pool->last = c;
+      pool->num_chunks++;
+      pool->num_free++;
+      --num_chunks;
+    }
+  }
+
+  return true;
+}
+
+void GPURegionAllocator::CheckForMemoryLeaks() {
+  std::vector<string> errors;
+  mutex_lock l(lock_);  // could use reader lock
+  for (auto pool_map : pools_) {
+    const Pool& p = pool_map.second;
+    Chunk* curr_chunk = p.first;
+    while (curr_chunk != nullptr) {
+      if (curr_chunk->in_use) {
+        errors.push_back(
+            strings::StrCat("Unfreed chunk of size ", curr_chunk->size));
+      }
+      curr_chunk = curr_chunk->next;
+    }
+  }
+  if (!errors.empty()) {
+    LOG(FATAL) << "GPU Memory leaks:\n" << str_util::Join(errors, "\n");
+  }
+}
+
+// Since there's no merging of chunks once allocated, we want to
+// maximize their reusability (which argues for fewer, larger sizes),
+// while minimizing waste (which argues for tight-fitting sizes).
+//
+// The smallest unit of allocation is 256 bytes.
+// NOTE(tucker): akrizhevsky says that nvidia's memory manager always
+// aligns to 256 bytes, and doing so results in significant speedup.
+//
+// Up to 2^16 bytes we only allocate in powers of 2.
+//
+// Above that, we pick a max-waste which is the largest power
+// of 2 <= 1/16 of the requested size, then round up to the nearest
+// multiple of max_waste.
+//
+// static
+size_t GPURegionAllocator::ChunkSize(size_t bytes) {
+  if (bytes <= 256) {
+    return 256;
+  } else if (bytes <= (1 << 16)) {
+    return 1uLL << Log2Ceiling64(bytes);
+  } else {
+    // 1/16th of requested size
+    size_t max_waste = 1uLL << (Log2Ceiling64(bytes) - 4);
+    return (bytes + max_waste) & (~(max_waste - 1));
+  }
+}
+
+void GPURegionAllocator::AddAllocVisitor(Visitor visitor) {
+  VLOG(1) << "AddVisitor";
+  mutex_lock l(lock_);
+  region_visitors_.push_back(visitor);
+  for (auto region : regions_) {
+    visitor(region->ptr, region->size);
+  }
+}
+
+void GPURegionAllocator::DumpMemoryLog() {
+  size_t region_bytes = 0;
+  for (auto r : regions_) {
+    region_bytes += r->size;
+  }
+  size_t chunk_bytes = 0;
+  std::vector<size_t> chunk_sizes;
+  for (auto i : pools_) {
+    chunk_sizes.push_back(i.first);
+  }
+  std::sort(chunk_sizes.begin(), chunk_sizes.end());
+  for (auto i : chunk_sizes) {
+    int32 chunks_in_use = 0;
+    const Pool& p = pools_[i];
+    chunk_bytes += i * p.num_chunks;
+
+    if (p.num_chunks > 0) {
+      // Iterate backwards (allocated chunks are last).
+      Chunk* curr_chunk = p.last;
+      while (curr_chunk != nullptr) {
+        if (curr_chunk->in_use) {
+          ++chunks_in_use;
+        }
+        curr_chunk = curr_chunk->prev;
+        if (curr_chunk == p.first) {
+          break;
+        }
+      }
+    }
+
+    LOG(INFO) << "Chunk size: " << i << " ("
+              << strings::HumanReadableNumBytes(i) << ") Pool: " << p.ToString()
+              << "\nNumber of chunks: " << p.num_chunks
+              << ", in_use chunks: " << chunks_in_use;
+  }
+
+  LOG(INFO) << "Aggregate Region Memory: " << region_bytes << " ("
+            << strings::HumanReadableNumBytes(region_bytes) << ")";
+  LOG(INFO) << "Aggregate Chunk Memory: " << chunk_bytes << " ("
+            << strings::HumanReadableNumBytes(chunk_bytes) << ")";
+}
+
+bool GPURegionAllocator::TracksAllocationSizes() { return true; }
+
+size_t GPURegionAllocator::RequestedSize(void* ptr) {
+  mutex_lock l(lock_);
+  auto it = chunk_map_.find(ptr);
+  CHECK(it != chunk_map_.end())
+      << "Asked for requested size of pointer we never allocated: " << ptr;
+  auto c = it->second;
+  return c->bytes_allocated;
+}
+
+size_t GPURegionAllocator::AllocatedSize(void* ptr) {
+  mutex_lock l(lock_);
+  auto it = chunk_map_.find(ptr);
+  CHECK(it != chunk_map_.end())
+      << "Asked for allocated size of pointer we never allocated: " << ptr;
+  auto c = it->second;
+  return c->size;
+}
+
+}  // namespace tensorflow
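
The pool-growth arithmetic in ExpandPool is self-contained enough to check in isolation. Below is a minimal standalone sketch of just that sizing decision; ChunksToAdd is a hypothetical helper name, not part of the commit:

#include <algorithm>
#include <cstddef>
#include <iostream>

// Hypothetical helper mirroring ExpandPool's sizing decision above:
// how many chunks to add, given the pool's current population.
static int ChunksToAdd(int current_chunks, size_t chunk_size) {
  int num_chunks = current_chunks;  // doubling: add as many as already exist
  if (num_chunks == 0) {
    // Seed an empty pool with ~4KB worth of chunks (one if larger).
    num_chunks = chunk_size > 4096 ? 1 : static_cast<int>(4096 / chunk_size);
  }
  // Cap a single expansion at ~1MB worth of chunks.
  if (num_chunks * chunk_size > (1 << 20)) {
    num_chunks = static_cast<int>(
        std::max(static_cast<size_t>(1), (1 << 20) / chunk_size));
  }
  return num_chunks;
}

int main() {
  std::cout << ChunksToAdd(0, 256) << "\n";      // 16 (seed: 4096/256)
  std::cout << ChunksToAdd(16, 256) << "\n";     // 16 (double to 32 total)
  std::cout << ChunksToAdd(0, 1 << 21) << "\n";  // 1 (one 2MB chunk)
  return 0;
}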
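
Similarly, the ChunkSize rounding policy described in the comment block can be exercised standalone. A minimal sketch, with Log2Ceiling64 (which in the commit comes from tensorflow/core/lib/core/bits.h) replaced by a naive portable stand-in:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Naive stand-in for Log2Ceiling64 from tensorflow/core/lib/core/bits.h:
// the smallest k such that (1 << k) >= n, for n >= 1.
static int Log2Ceiling64(uint64_t n) {
  int k = 0;
  while ((uint64_t{1} << k) < n) ++k;
  return k;
}

// Same rounding rule as GPURegionAllocator::ChunkSize above.
static size_t ChunkSize(size_t bytes) {
  if (bytes <= 256) return 256;  // smallest unit of allocation
  if (bytes <= (1 << 16)) {
    return size_t{1} << Log2Ceiling64(bytes);  // next power of 2
  }
  // max_waste is the largest power of 2 <= 1/16 of the request.
  size_t max_waste = size_t{1} << (Log2Ceiling64(bytes) - 4);
  return (bytes + max_waste) & ~(max_waste - 1);  // round up to a multiple
}

int main() {
  assert(ChunkSize(1) == 256);          // everything tiny shares one pool
  assert(ChunkSize(300) == 512);        // next power of 2
  assert(ChunkSize(100000) == 106496);  // max_waste = 8192, 13 * 8192
  return 0;
}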
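
Finally, a hedged usage sketch of the public surface (AllocateRaw/DeallocateRaw and the two size-tracking accessors, which deliberately disagree because of chunk rounding). The device id and byte budget are illustrative assumptions, and an initialized GPU platform (see gpu_init.h above) is required:

#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h"
#include "tensorflow/core/platform/logging.h"

int main() {
  // Illustrative only: device 0 and the 1GB budget are assumptions.
  tensorflow::GPURegionAllocator allocator(/*device_id=*/0,
                                           /*total_bytes=*/1 << 30);
  void* p = allocator.AllocateRaw(/*alignment=*/256, /*num_bytes=*/300);
  // 300 bytes lands in the 512-byte pool (see ChunkSize above).
  CHECK_EQ(allocator.RequestedSize(p), 300);
  CHECK_EQ(allocator.AllocatedSize(p), 512);
  allocator.DeallocateRaw(p);
  return 0;
}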