Diffstat (limited to 'tensorflow/core/common_runtime/gpu/pool_allocator.h')
 tensorflow/core/common_runtime/gpu/pool_allocator.h | 202 +++++++++++++++++++
 1 file changed, 202 insertions(+), 0 deletions(-)
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h
new file mode 100644
index 0000000000..d10aabe88a
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h
@@ -0,0 +1,202 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_
+
+// Simple LRU pool allocators for various flavors of CPU RAM that
+// implement the VisitableAllocator interface. GPU memory is managed
+// by GPURegionAllocator.
+
+#include <atomic>
+#include <map>
+#include <memory>
+#include "tensorflow/core/lib/core/bits.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace tensorflow {
+
+// Interface of an object that does the underlying alloc/free of memory.
+class SubAllocator {
+ public:
+ virtual ~SubAllocator() {}
+ virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
+ virtual void Free(void* ptr, size_t num_bytes) = 0;
+};
+
+// Interface of an object that rounds up integers.
+class RoundUpInterface {
+ public:
+ virtual ~RoundUpInterface() {}
+ virtual size_t RoundUp(size_t num_bytes) = 0;
+};
+
+// Size-limited pool of memory buffers obtained from a SubAllocator
+// instance. Pool eviction policy is LRU.
+class PoolAllocator : public VisitableAllocator {
+ public:
+ // "pool_size_limit" is the maximum number of returned, re-usable
+ // memory buffers to keep in the pool. If pool_size_limit == 0, the
+ // pool is effectively a thin wrapper around the allocator.
+ // If "auto_resize" is true, then the pool_size_limit will gradually
+ // be raised so that deallocations happen very rarely, if at all.
+  // Transitory start-up objects may still be deallocated, but the
+  // long-term working set should not be. Auto-resizing can raise
+  // pool_size_limit but will never lower it.
+  // "allocator" is the object that performs the underlying memory
+  // malloc/free operations. This object takes ownership of both
+  // "allocator" and "size_rounder".
+ PoolAllocator(size_t pool_size_limit, bool auto_resize,
+ SubAllocator* allocator, RoundUpInterface* size_rounder,
+ string name);
+ ~PoolAllocator() override;
+
+ string Name() override { return name_; }
+
+ void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+
+ void DeallocateRaw(void* ptr) override;
+
+ // REQUIRES: The following functions may only be called prior
+ // to the first Allocate*() call. Once allocation has begun, it is
+ // illegal to register another visitor.
+
+ void AddAllocVisitor(Visitor visitor) override;
+
+ void AddFreeVisitor(Visitor visitor) override;
+
+ // Allocate an unused memory region of size "num_bytes". Fetch from
+ // the pool if available, otherwise call allocator_.
+ void* Get(size_t num_bytes);
+
+  // Return a no-longer-needed memory region to the pool. It is an error
+  // to dereference "ptr" after this call. If the pool is full, the least
+  // recently used region will be deallocated.
+ void Put(void* ptr, size_t num_bytes);
+
+ // Reset the pool to empty.
+ void Clear();
+
+ // The following accessors permit monitoring the effectiveness of
+ // the pool at avoiding repeated malloc/frees on the underlying
+ // allocator. Read locks are not taken on the theory that value
+ // consistency with other threads is not important.
+
+ // Number of Get() requests satisfied from pool.
+ int64 get_from_pool_count() const NO_THREAD_SAFETY_ANALYSIS {
+ return get_from_pool_count_;
+ }
+ // Number of Put() requests.
+ int64 put_count() const NO_THREAD_SAFETY_ANALYSIS { return put_count_; }
+ // Number of Get() requests requiring a fresh allocation.
+ int64 allocated_count() const NO_THREAD_SAFETY_ANALYSIS {
+ return allocated_count_;
+ }
+ // Number of pool evictions.
+ int64 evicted_count() const NO_THREAD_SAFETY_ANALYSIS {
+ return evicted_count_;
+ }
+ // Current size limit.
+ size_t size_limit() const NO_THREAD_SAFETY_ANALYSIS {
+ return pool_size_limit_;
+ }
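+
+  // As an example, the pool hit rate over the lifetime of this allocator
+  // can be estimated (without locking, per the note above) as
+  //   get_from_pool_count() /
+  //       static_cast<double>(get_from_pool_count() + allocated_count())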
+
+ private:
+ struct PtrRecord {
+ void* ptr;
+ size_t num_bytes;
+ PtrRecord* prev;
+ PtrRecord* next;
+ };
+
+ // Remove "pr" from the double-linked LRU list.
+ void RemoveFromList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Add "pr" to the head of the double-linked LRU list.
+ void AddToList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Delete the least recently used record.
+ void EvictOne() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ const string name_;
+ const bool has_size_limit_;
+ const bool auto_resize_;
+ size_t pool_size_limit_;
+ std::unique_ptr<SubAllocator> allocator_;
+ std::unique_ptr<RoundUpInterface> size_rounder_;
+ mutex mutex_;
+ std::multimap<const size_t, PtrRecord*> pool_ GUARDED_BY(mutex_);
+ PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr;
+ PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr;
+ int64 get_from_pool_count_ GUARDED_BY(mutex_) = 0;
+ int64 put_count_ GUARDED_BY(mutex_) = 0;
+ int64 allocated_count_ GUARDED_BY(mutex_) = 0;
+ int64 evicted_count_ GUARDED_BY(mutex_) = 0;
+ // Write access to these is guarded by mutex_, but not read
+ // access. They may only be modified prior to the first
+ // allocation. Later attempts to modify will fail.
+ std::vector<Visitor> alloc_visitors_;
+ std::vector<Visitor> free_visitors_;
+ std::atomic<bool> allocation_begun_;
+};
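+
+// Illustrative usage sketch, not a prescribed configuration: the pool is
+// composed from a SubAllocator and a RoundUpInterface, both declared later
+// in this header. The limit, alignment, size, and name below are arbitrary
+// example values.
+//
+//   PoolAllocator pool(100 /*pool_size_limit*/, true /*auto_resize*/,
+//                      new BasicCPUAllocator(), new Pow2Rounder(),
+//                      "example_cpu_pool");
+//   void* buf = pool.AllocateRaw(64 /*alignment*/, 1024 /*num_bytes*/);
+//   // ... use buf ...
+//   pool.DeallocateRaw(buf);  // Returns the buffer to the pool for reuse.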
+
+// Do-nothing rounder. Passes through sizes unchanged.
+class NoopRounder : public RoundUpInterface {
+ public:
+ size_t RoundUp(size_t num_bytes) override { return num_bytes; }
+};
+
+// Power of 2 rounder: rounds up to nearest power of 2 size.
+class Pow2Rounder : public RoundUpInterface {
+ public:
+ size_t RoundUp(size_t num_bytes) override {
+ return 1uLL << Log2Ceiling64(num_bytes);
+ }
+};
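+
+// For example, Pow2Rounder rounds a request of 100 bytes up to 128, while
+// sizes that are already powers of two are returned unchanged.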
+
+class BasicCPUAllocator : public SubAllocator {
+ public:
+ ~BasicCPUAllocator() override {}
+
+ void* Alloc(size_t alignment, size_t num_bytes) override {
+ return port::aligned_malloc(num_bytes, alignment);
+ }
+  void Free(void* ptr, size_t num_bytes) override {
+    port::aligned_free(ptr);  // Matches the aligned_malloc in Alloc().
+  }
+};
+
+// Allocator for pinned CPU RAM that is made known to CUDA for the
+// purpose of efficient DMA with a GPU.
+class CUDAHostAllocator : public SubAllocator {
+ public:
+ // Note: stream_exec cannot be null.
+ explicit CUDAHostAllocator(perftools::gputools::StreamExecutor* stream_exec)
+ : stream_exec_(stream_exec) {
+ CHECK(stream_exec_ != nullptr);
+ }
+ ~CUDAHostAllocator() override {}
+
+ void* Alloc(size_t alignment, size_t num_bytes) override {
+ void* ptr = nullptr;
+ if (num_bytes > 0) {
+ ptr = stream_exec_->HostMemoryAllocate(num_bytes);
+ if (ptr == nullptr) {
+ LOG(FATAL) << "could not allocate pinned host memory of size: "
+ << num_bytes;
+ }
+ }
+ return ptr;
+ }
+
+ void Free(void* ptr, size_t num_bytes) override {
+ if (ptr != nullptr) {
+ stream_exec_->HostMemoryDeallocate(ptr);
+ }
+ }
+
+ private:
+ perftools::gputools::StreamExecutor* stream_exec_; // not owned, non-null
+
+ TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
+};
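+
+// Illustrative sketch (names and values are examples only): a
+// CUDAHostAllocator can be wrapped in a PoolAllocator so that pinned
+// buffers are reused across requests instead of being freshly allocated
+// each time. "stream_exec" stands for a StreamExecutor obtained from the
+// GPU device elsewhere.
+//
+//   perftools::gputools::StreamExecutor* stream_exec = ...;
+//   PoolAllocator pinned_pool(100 /*pool_size_limit*/, true /*auto_resize*/,
+//                             new CUDAHostAllocator(stream_exec),
+//                             new Pow2Rounder(), "example_pinned_pool");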
+
+} // namespace tensorflow
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_