#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h" //#include "base/commandlineflags.h" #include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h" #include "tensorflow/core/common_runtime/gpu/gpu_init.h" #include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/lib/gtl/stl_util.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/port.h" #if defined(PLATFORM_GOOGLE) DEFINE_bool(brain_gpu_region_allocator_heap_check_on_destruction, true, "If true, the CUDA gpu manager checks that all allocated " "memory through the GPU memory pool implementation has been " "freed."); DEFINE_int64(brain_gpu_region_allocator_region_size, 0, "If > 0, sets the default chunk-size allocatable from GPU memory. " "Else defaults to entire GPU memory."); #else bool FLAGS_brain_gpu_region_allocator_heap_check_on_destruction = true; tensorflow::int64 FLAGS_brain_gpu_region_allocator_region_size = 0; #endif namespace gpu = ::perftools::gputools; namespace tensorflow { GPURegionAllocator::GPURegionAllocator(int device_id, size_t total_bytes) : device_id_(device_id), total_bytes_(total_bytes) { // Get a pointer to the stream_executor for this device stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie(); // Set the region size based on explicit user request, or based on // total GPU capacity. if (FLAGS_brain_gpu_region_allocator_region_size > 0) { region_size_ = FLAGS_brain_gpu_region_allocator_region_size; } else { region_size_ = static_cast(total_bytes_); } LOG(INFO) << "Setting region size to " << region_size_; } GPURegionAllocator::~GPURegionAllocator() { if (FLAGS_brain_gpu_region_allocator_heap_check_on_destruction) { CheckForMemoryLeaks(); } gtl::STLDeleteValues(&chunk_map_); for (auto r : regions_) { gpu::DeviceMemoryBase gpu_ptr{r->ptr}; stream_exec_->Deallocate(&gpu_ptr); delete r; } } void* GPURegionAllocator::AllocateRaw(size_t alignment, size_t num_bytes) { static const int64 kMaxMillisToWait = 10000; // 10 seconds return retry_helper_.AllocateRaw( [this](size_t a, size_t nb, bool v) { return AllocateRawInternal(a, nb, v); }, kMaxMillisToWait, alignment, num_bytes); } void* GPURegionAllocator::AllocateRawInternal(size_t alignment, size_t num_bytes, bool dump_log_on_failure) { if (num_bytes == 0) { LOG(ERROR) << "tried to allocate 0 bytes"; return nullptr; } size_t chunk_size = ChunkSize(num_bytes); VLOG(2) << "chunk_size " << chunk_size << " from num_bytes " << strings::HumanReadableNumBytes(num_bytes); mutex_lock l(lock_); Pool* pool = &pools_[chunk_size]; if (pool->num_free == 0) { if (!ExpandPool(pool, chunk_size, num_bytes, dump_log_on_failure)) { if (dump_log_on_failure) { LOG(WARNING) << "Out of GPU memory, see memory state dump above"; } return nullptr; } } CHECK_LT(0, pool->num_free); CHECK(pool->first); CHECK(pool->last); Chunk* c = pool->first; CHECK(c); CHECK(!c->in_use); c->in_use = true; // Move c to the back of the queue. 
  if (c->next != nullptr) {
    pool->first = c->next;
    pool->first->prev = nullptr;
    c->next = nullptr;
  }
  if (pool->last != c) {
    pool->last->next = c;
    c->prev = pool->last;
    pool->last = c;
  }
  pool->num_free--;
  pool->cumulative_malloced++;
  void* rv = c->ptr;
  c->bytes_allocated = num_bytes;

  VLOG(2) << "new ptr " << rv;
  return rv;
}

void GPURegionAllocator::DeallocateRaw(void* ptr) {
  retry_helper_.DeallocateRaw([this](void* p) { DeallocateRawInternal(p); },
                              ptr);
}

void GPURegionAllocator::DeallocateRawInternal(void* ptr) {
  VLOG(2) << "DeallocateRaw: " << ptr;
  if (ptr == nullptr) {
    LOG(ERROR) << "tried to deallocate nullptr";
    return;
  }

  mutex_lock l(lock_);
  ChunkMap::const_iterator iter = chunk_map_.find(ptr);
  CHECK(iter != chunk_map_.end());

  Chunk* c = iter->second;
  VLOG(2) << "chunk of size " << c->size << " at " << c;

  Pool* pool = &(pools_[c->size]);
  // Move chunk to head of queue, and mark free.
  DCHECK(c->in_use);
  c->in_use = false;
  if (c->prev) c->prev->next = c->next;
  if (c->next) c->next->prev = c->prev;
  if (pool->first == c) pool->first = c->next;
  if (pool->last == c) pool->last = c->prev;
  c->next = pool->first;
  c->prev = nullptr;
  if (c->next) c->next->prev = c;
  pool->first = c;
  if (pool->last == nullptr) pool->last = c;
  pool->num_free++;
  pool->cumulative_freed++;
}

bool GPURegionAllocator::ExpandPool(Pool* pool, size_t chunk_size,
                                    size_t requested_size,
                                    bool dump_log_on_failure) {
  VLOG(1) << "ExpandPool of " << chunk_size << " from " << pool->num_chunks
          << " current members";
  DCHECK_NE(0, chunk_size);
  // If chunk_size is < 4096, double the pool size.  Otherwise
  // just increase by one.
  int num_chunks = pool->num_chunks;
  if (num_chunks == 0) {
    if (chunk_size > 4096) {
      num_chunks = 1;
    } else {
      num_chunks = 4096 / chunk_size;
    }
  }
  // For larger chunks, limit the amount of expansion.
  size_t aggregate_size = num_chunks * chunk_size;
  if (aggregate_size > (1 << 20)) {
    num_chunks = static_cast<int>(
        std::max(static_cast<size_t>(1), (1 << 20) / chunk_size));
  }
  while (num_chunks > 0) {
    Region* r = (regions_.empty() ? nullptr : regions_.back());
    if (r == nullptr ||
        (((r->ptr + r->size) - r->next) < static_cast<int64>(chunk_size))) {
      // Current region is not large enough to accommodate another chunk.
      while (r == nullptr ||
             (((r->ptr + r->size) - r->next) <
              static_cast<int64>(chunk_size))) {
        // Get another region.
        size_t this_region_size = std::max(region_size_, chunk_size);

        // Check if we would exceed our limit.
        if (allocated_memory_ + this_region_size > total_bytes_) {
          if (dump_log_on_failure) DumpMemoryLog();
          return false;
        }

        // Perform the allocation, still checking that the allocator
        // has not run out of memory.
        gpu::DeviceMemory<char> gpu_mem =
            stream_exec_->AllocateArray<char>(this_region_size);
        if (gpu_mem == nullptr) {
          if (dump_log_on_failure) DumpMemoryLog();
          return false;
        }

        // We never release memory once expanded.
        allocated_memory_ += this_region_size;

        Region* nr = new Region;
        nr->ptr = static_cast<char*>(gpu_mem.opaque());

        if (VLOG_IS_ON(2)) {
          int64 free_bytes;
          int64 total_bytes;
          if (stream_exec_->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
            VLOG(2) << "free " << free_bytes << " total " << total_bytes;
          } else {
            // Note: stream_exec call also logs internally on failure.
            VLOG(2) << "could not retrieve memory usage";
          }
        }
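        // Illustration (hypothetical values): a Region is a raw slab
        // carved front-to-back by bumping its next pointer, e.g. with
        // ptr = 0x1000 and size = 4096:
        //
        //   carve chunk A (256 B) at 0x1000, next -> 0x1100
        //   carve chunk B (256 B) at 0x1100, next -> 0x1200
        //   remaining = (ptr + size) - next = 0x2000 - 0x1200 = 3584 B
        //
        // Once the remainder drops below chunk_size, the check at the
        // top of this loop fails and a fresh region is allocated here.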
VLOG(2) << "could not retrieve memory usage"; } } VLOG(1) << "new Region of size " << this_region_size << " at " << static_cast(nr->ptr) << " on device " << device_id_; r = nr; r->size = this_region_size; r->next = r->ptr; regions_.push_back(r); for (auto visitor : region_visitors_) { visitor(r->ptr, r->size); } } } else { // Allocate a new chunk and push on front of Pool. Chunk* c = new Chunk; c->ptr = r->next; chunk_map_[c->ptr] = c; c->size = chunk_size; r->next += chunk_size; c->next = pool->first; if (c->next != nullptr) c->next->prev = c; pool->first = c; if (pool->last == nullptr) pool->last = c; pool->num_chunks++; pool->num_free++; --num_chunks; } } return true; } void GPURegionAllocator::CheckForMemoryLeaks() { std::vector errors; mutex_lock l(lock_); // could use reader lock for (auto pool_map : pools_) { const Pool& p = pool_map.second; Chunk* curr_chunk = p.first; while (curr_chunk != nullptr) { if (curr_chunk->in_use) { errors.push_back( strings::StrCat("Unfreed chunk of size ", curr_chunk->size)); } curr_chunk = curr_chunk->next; } } if (!errors.empty()) { LOG(FATAL) << "GPU Memory leaks:\n" << str_util::Join(errors, "\n"); } } // Since there's no merging of chunks once allocated, we want to // maximize their reusablity (which argues for fewer, larger sizes), // while minimizing waste (which argues for tight-fitting sizes). // // The smallest unit of allocation is 256 bytes. // NOTE(tucker): akrizhevsky says that nvidia's memory manager always // aligns to 256 bytes, and doing so results in significant speedup. // // Up to 2^16 bytes we only allocate in powers of 2. // // Above that, we pick a max-waste which is the largest power // of 2 <= 1/16 of the requested size, then round up to the nearest // multiple of max_waste. // // static size_t GPURegionAllocator::ChunkSize(size_t bytes) { if (bytes <= 256) { return 256; } else if (bytes <= (1 << 16)) { return 1uLL << Log2Ceiling64(bytes); } else { // 1/16th of requested size size_t max_waste = 1uLL << (Log2Ceiling64(bytes) - 4); return (bytes + max_waste) & (~(max_waste - 1)); } } void GPURegionAllocator::AddAllocVisitor(Visitor visitor) { VLOG(1) << "AddVisitor"; mutex_lock l(lock_); region_visitors_.push_back(visitor); for (auto region : regions_) { visitor(region->ptr, region->size); } } void GPURegionAllocator::DumpMemoryLog() { size_t region_bytes = 0; for (auto r : regions_) { region_bytes += r->size; } size_t chunk_bytes = 0; std::vector chunk_sizes; for (auto i : pools_) { chunk_sizes.push_back(i.first); } std::sort(chunk_sizes.begin(), chunk_sizes.end()); for (auto i : chunk_sizes) { int32 chunks_in_use = 0; const Pool& p = pools_[i]; chunk_bytes += i * p.num_chunks; if (p.num_chunks > 0) { // Iterate backwards (allocated chunks are last). 
      Chunk* curr_chunk = p.last;
      while (curr_chunk != nullptr) {
        if (curr_chunk->in_use) {
          ++chunks_in_use;
        }
        // Stop only after examining the head of the list; breaking
        // before checking p.first would undercount an in-use head chunk.
        if (curr_chunk == p.first) {
          break;
        }
        curr_chunk = curr_chunk->prev;
      }
    }
    LOG(INFO) << "Chunk size: " << i << " ("
              << strings::HumanReadableNumBytes(i)
              << ") Pool: " << p.ToString()
              << "\nNumber of chunks: " << p.num_chunks
              << ", in_use chunks: " << chunks_in_use;
  }
  LOG(INFO) << "Aggregate Region Memory: " << region_bytes << " ("
            << strings::HumanReadableNumBytes(region_bytes) << ")";
  LOG(INFO) << "Aggregate Chunk Memory: " << chunk_bytes << " ("
            << strings::HumanReadableNumBytes(chunk_bytes) << ")";
}

bool GPURegionAllocator::TracksAllocationSizes() { return true; }

size_t GPURegionAllocator::RequestedSize(void* ptr) {
  mutex_lock l(lock_);
  auto it = chunk_map_.find(ptr);
  CHECK(it != chunk_map_.end())
      << "Asked for requested size of pointer we never allocated: " << ptr;
  auto c = it->second;
  return c->bytes_allocated;
}

size_t GPURegionAllocator::AllocatedSize(void* ptr) {
  mutex_lock l(lock_);
  auto it = chunk_map_.find(ptr);
  CHECK(it != chunk_map_.end())
      << "Asked for allocated size of pointer we never allocated: " << ptr;
  auto c = it->second;
  return c->size;
}

}  // namespace tensorflow
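// Illustrative usage sketch (hypothetical values), showing the
// round-up performed by ChunkSize():
//
//   GPURegionAllocator a(/*device_id=*/0, /*total_bytes=*/1 << 30);
//   void* p = a.AllocateRaw(/*alignment=*/256, /*num_bytes=*/1000);
//   // RequestedSize(p) == 1000: bytes the client asked for.
//   // AllocatedSize(p) == 1024: 1000 <= 2^16, so ChunkSize rounds up
//   // to the next power of 2.
//   a.DeallocateRaw(p);
//
// For a large request, e.g. 100000 bytes: Log2Ceiling64(100000) == 17,
// max_waste == 2^13 == 8192, and ChunkSize returns
// (100000 + 8192) & ~8191 == 106496, a multiple of 8192 wasting < 8192.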