diff options
Diffstat (limited to 'tensorflow/core/common_runtime/gpu/pool_allocator.h')
-rw-r--r-- | tensorflow/core/common_runtime/gpu/pool_allocator.h | 202 |
1 file changed, 202 insertions, 0 deletions
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h new file mode 100644 index 0000000000..d10aabe88a --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h @@ -0,0 +1,202 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_ + +// Simple LRU pool allocators for various flavors of CPU RAM that +// implement the VisitableAllocator interface. GPU memory is managed +// by GPURegionAllocator. + +#include <atomic> +#include <map> +#include <memory> +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h" +#include "tensorflow/stream_executor/stream_executor.h" + +namespace tensorflow { + +// Interface of an object that does the underlying alloc/free of memory. +class SubAllocator { + public: + virtual ~SubAllocator() {} + virtual void* Alloc(size_t alignment, size_t num_bytes) = 0; + virtual void Free(void* ptr, size_t num_bytes) = 0; +}; + +// Interface of an object that rounds up integers. +class RoundUpInterface { + public: + virtual ~RoundUpInterface() {} + virtual size_t RoundUp(size_t num_bytes) = 0; +}; + +// Size-limited pool of memory buffers obtained from a SubAllocator +// instance. Pool eviction policy is LRU. +class PoolAllocator : public VisitableAllocator { + public: + // "pool_size_limit" is the maximum number of returned, re-usable + // memory buffers to keep in the pool. If pool_size_limit == 0, the + // pool is effectively a thin wrapper around the allocator. + // If "auto_resize" is true, then the pool_size_limit will gradually + // be raised so that deallocations happen very rarely, if at all. + // Transitory start-up objects may deallocate, but the long-term + // working-set should not. Auto-resizing can raise pool_size_limit + // but will never lower it. 
+ // "allocator" is the object that performs the underlying memory + // malloc/free operations. This object takes ownership of allocator. + PoolAllocator(size_t pool_size_limit, bool auto_resize, + SubAllocator* allocator, RoundUpInterface* size_rounder, + string name); + ~PoolAllocator() override; + + string Name() override { return name_; } + + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + + void DeallocateRaw(void* ptr) override; + + // REQUIRES: The following functions may only be called prior + // to the first Allocate*() call. Once allocation has begun, it is + // illegal to register another visitor. + + void AddAllocVisitor(Visitor visitor) override; + + void AddFreeVisitor(Visitor visitor) override; + + // Allocate an unused memory region of size "num_bytes". Fetch from + // the pool if available, otherwise call allocator_. + void* Get(size_t num_bytes); + + // Return a no-longer needed memory region to the pool. It is an error + // to deference "ptr" after this call. If the pool is full, the least + // recently used region will be deallocated. + void Put(void* ptr, size_t num_bytes); + + // Reset the pool to empty. + void Clear(); + + // The following accessors permit monitoring the effectiveness of + // the pool at avoiding repeated malloc/frees on the underlying + // allocator. Read locks are not taken on the theory that value + // consistency with other threads is not important. + + // Number of Get() requests satisfied from pool. + int64 get_from_pool_count() const NO_THREAD_SAFETY_ANALYSIS { + return get_from_pool_count_; + } + // Number of Put() requests. + int64 put_count() const NO_THREAD_SAFETY_ANALYSIS { return put_count_; } + // Number of Get() requests requiring a fresh allocation. + int64 allocated_count() const NO_THREAD_SAFETY_ANALYSIS { + return allocated_count_; + } + // Number of pool evictions. + int64 evicted_count() const NO_THREAD_SAFETY_ANALYSIS { + return evicted_count_; + } + // Current size limit. 
+ size_t size_limit() const NO_THREAD_SAFETY_ANALYSIS { + return pool_size_limit_; + } + + private: + struct PtrRecord { + void* ptr; + size_t num_bytes; + PtrRecord* prev; + PtrRecord* next; + }; + + // Remove "pr" from the double-linked LRU list. + void RemoveFromList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Add "pr" to the head of the double-linked LRU list. + void AddToList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Delete the least recently used record. + void EvictOne() EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + const string name_; + const bool has_size_limit_; + const bool auto_resize_; + size_t pool_size_limit_; + std::unique_ptr<SubAllocator> allocator_; + std::unique_ptr<RoundUpInterface> size_rounder_; + mutex mutex_; + std::multimap<const size_t, PtrRecord*> pool_ GUARDED_BY(mutex_); + PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr; + PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr; + int64 get_from_pool_count_ GUARDED_BY(mutex_) = 0; + int64 put_count_ GUARDED_BY(mutex_) = 0; + int64 allocated_count_ GUARDED_BY(mutex_) = 0; + int64 evicted_count_ GUARDED_BY(mutex_) = 0; + // Write access to these is guarded by mutex_, but not read + // access. They may only be modified prior to the first + // allocation. Later attempts to modify will fail. + std::vector<Visitor> alloc_visitors_; + std::vector<Visitor> free_visitors_; + std::atomic<bool> allocation_begun_; +}; + +// Do-nothing rounder. Passes through sizes unchanged. +class NoopRounder : public RoundUpInterface { + public: + size_t RoundUp(size_t num_bytes) override { return num_bytes; } +}; + +// Power of 2 rounder: rounds up to nearest power of 2 size. 
+class Pow2Rounder : public RoundUpInterface { + public: + size_t RoundUp(size_t num_bytes) override { + return 1uLL << Log2Ceiling64(num_bytes); + } +}; + +class BasicCPUAllocator : public SubAllocator { + public: + ~BasicCPUAllocator() override {} + + void* Alloc(size_t alignment, size_t num_bytes) override { + return port::aligned_malloc(num_bytes, alignment); + } + void Free(void* ptr, size_t num_bytes) override { free(ptr); } +}; + +// Allocator for pinned CPU RAM that is made known to CUDA for the +// purpose of efficient DMA with a GPU. +class CUDAHostAllocator : public SubAllocator { + public: + // Note: stream_exec cannot be null. + explicit CUDAHostAllocator(perftools::gputools::StreamExecutor* stream_exec) + : stream_exec_(stream_exec) { + CHECK(stream_exec_ != nullptr); + } + ~CUDAHostAllocator() override {} + + void* Alloc(size_t alignment, size_t num_bytes) override { + void* ptr = nullptr; + if (num_bytes > 0) { + ptr = stream_exec_->HostMemoryAllocate(num_bytes); + if (ptr == nullptr) { + LOG(FATAL) << "could not allocate pinned host memory of size: " + << num_bytes; + } + } + return ptr; + } + + void Free(void* ptr, size_t num_bytes) override { + if (ptr != nullptr) { + stream_exec_->HostMemoryDeallocate(ptr); + } + } + + private: + perftools::gputools::StreamExecutor* stream_exec_; // not owned, non-null + + TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_ |