diff options
Diffstat (limited to 'tensorflow/core/common_runtime/gpu/pool_allocator.h')
-rw-r--r-- | tensorflow/core/common_runtime/gpu/pool_allocator.h | 202 |
1 file changed, 202 insertions, 0 deletions
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h new file mode 100644 index 0000000000..d10aabe88a --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h @@ -0,0 +1,202 @@ +#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_ +#define TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_ + +// Simple LRU pool allocators for various flavors of CPU RAM that +// implement the VisitableAllocator interface. GPU memory is managed +// by GPURegionAllocator. + +#include <atomic> +#include <map> +#include <memory> +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h" +#include "tensorflow/stream_executor/stream_executor.h" + +namespace tensorflow { + +// Interface of an object that does the underlying alloc/free of memory. +class SubAllocator { + public: + virtual ~SubAllocator() {} + virtual void* Alloc(size_t alignment, size_t num_bytes) = 0; + virtual void Free(void* ptr, size_t num_bytes) = 0; +}; + +// Interface of an object that rounds up integers. +class RoundUpInterface { + public: + virtual ~RoundUpInterface() {} + virtual size_t RoundUp(size_t num_bytes) = 0; +}; + +// Size-limited pool of memory buffers obtained from a SubAllocator +// instance. Pool eviction policy is LRU. +class PoolAllocator : public VisitableAllocator { + public: + // "pool_size_limit" is the maximum number of returned, re-usable + // memory buffers to keep in the pool. If pool_size_limit == 0, the + // pool is effectively a thin wrapper around the allocator. + // If "auto_resize" is true, then the pool_size_limit will gradually + // be raised so that deallocations happen very rarely, if at all. + // Transitory start-up objects may deallocate, but the long-term + // working-set should not. Auto-resizing can raise pool_size_limit + // but will never lower it. 
+ // "allocator" is the object that performs the underlying memory + // malloc/free operations. This object takes ownership of allocator. + PoolAllocator(size_t pool_size_limit, bool auto_resize, + SubAllocator* allocator, RoundUpInterface* size_rounder, + string name); + ~PoolAllocator() override; + + string Name() override { return name_; } + + void* AllocateRaw(size_t alignment, size_t num_bytes) override; + + void DeallocateRaw(void* ptr) override; + + // REQUIRES: The following functions may only be called prior + // to the first Allocate*() call. Once allocation has begun, it is + // illegal to register another visitor. + + void AddAllocVisitor(Visitor visitor) override; + + void AddFreeVisitor(Visitor visitor) override; + + // Allocate an unused memory region of size "num_bytes". Fetch from + // the pool if available, otherwise call allocator_. + void* Get(size_t num_bytes); + + // Return a no-longer needed memory region to the pool. It is an error + // to deference "ptr" after this call. If the pool is full, the least + // recently used region will be deallocated. + void Put(void* ptr, size_t num_bytes); + + // Reset the pool to empty. + void Clear(); + + // The following accessors permit monitoring the effectiveness of + // the pool at avoiding repeated malloc/frees on the underlying + // allocator. Read locks are not taken on the theory that value + // consistency with other threads is not important. + + // Number of Get() requests satisfied from pool. + int64 get_from_pool_count() const NO_THREAD_SAFETY_ANALYSIS { + return get_from_pool_count_; + } + // Number of Put() requests. + int64 put_count() const NO_THREAD_SAFETY_ANALYSIS { return put_count_; } + // Number of Get() requests requiring a fresh allocation. + int64 allocated_count() const NO_THREAD_SAFETY_ANALYSIS { + return allocated_count_; + } + // Number of pool evictions. + int64 evicted_count() const NO_THREAD_SAFETY_ANALYSIS { + return evicted_count_; + } + // Current size limit. 
+ size_t size_limit() const NO_THREAD_SAFETY_ANALYSIS { + return pool_size_limit_; + } + + private: + struct PtrRecord { + void* ptr; + size_t num_bytes; + PtrRecord* prev; + PtrRecord* next; + }; + + // Remove "pr" from the double-linked LRU list. + void RemoveFromList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Add "pr" to the head of the double-linked LRU list. + void AddToList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Delete the least recently used record. + void EvictOne() EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + const string name_; + const bool has_size_limit_; + const bool auto_resize_; + size_t pool_size_limit_; + std::unique_ptr<SubAllocator> allocator_; + std::unique_ptr<RoundUpInterface> size_rounder_; + mutex mutex_; + std::multimap<const size_t, PtrRecord*> pool_ GUARDED_BY(mutex_); + PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr; + PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr; + int64 get_from_pool_count_ GUARDED_BY(mutex_) = 0; + int64 put_count_ GUARDED_BY(mutex_) = 0; + int64 allocated_count_ GUARDED_BY(mutex_) = 0; + int64 evicted_count_ GUARDED_BY(mutex_) = 0; + // Write access to these is guarded by mutex_, but not read + // access. They may only be modified prior to the first + // allocation. Later attempts to modify will fail. + std::vector<Visitor> alloc_visitors_; + std::vector<Visitor> free_visitors_; + std::atomic<bool> allocation_begun_; +}; + +// Do-nothing rounder. Passes through sizes unchanged. +class NoopRounder : public RoundUpInterface { + public: + size_t RoundUp(size_t num_bytes) override { return num_bytes; } +}; + +// Power of 2 rounder: rounds up to nearest power of 2 size. 
+class Pow2Rounder : public RoundUpInterface { + public: + size_t RoundUp(size_t num_bytes) override { + return 1uLL << Log2Ceiling64(num_bytes); + } +}; + +class BasicCPUAllocator : public SubAllocator { + public: + ~BasicCPUAllocator() override {} + + void* Alloc(size_t alignment, size_t num_bytes) override { + return port::aligned_malloc(num_bytes, alignment); + } + void Free(void* ptr, size_t num_bytes) override { free(ptr); } +}; + +// Allocator for pinned CPU RAM that is made known to CUDA for the +// purpose of efficient DMA with a GPU. +class CUDAHostAllocator : public SubAllocator { + public: + // Note: stream_exec cannot be null. + explicit CUDAHostAllocator(perftools::gputools::StreamExecutor* stream_exec) + : stream_exec_(stream_exec) { + CHECK(stream_exec_ != nullptr); + } + ~CUDAHostAllocator() override {} + + void* Alloc(size_t alignment, size_t num_bytes) override { + void* ptr = nullptr; + if (num_bytes > 0) { + ptr = stream_exec_->HostMemoryAllocate(num_bytes); + if (ptr == nullptr) { + LOG(FATAL) << "could not allocate pinned host memory of size: " + << num_bytes; + } + } + return ptr; + } + + void Free(void* ptr, size_t num_bytes) override { + if (ptr != nullptr) { + stream_exec_->HostMemoryDeallocate(ptr); + } + } + + private: + perftools::gputools::StreamExecutor* stream_exec_; // not owned, non-null + + TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator); +}; + +} // namespace tensorflow +#endif // TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_ |