Eliminate VisitableAllocator.

The visitor pattern is used to allow pre-registration of memory for DMA access, e.g. for fast GPU/CPU I/O and for RDMA networking. The VisitableAllocator interface was introduced to support this use some time ago, prior to SubAllocators. Memory registration works best if it's done infrequently, on large pieces of memory, rather than on every piece that's dynamically allocated/freed. This usage pattern fits the SubAllocator better than a general Allocator. This change moves memory allocation visitor access to SubAllocator and eliminates the VisitableAllocator subclass of Allocator. This change also more rigorously enforces the requirement that all Visitors be declared before memory allocation begins. This is accomplished by requiring that Visitors be provided to the SubAllocator constructor. This refactoring will ease an upcoming CL introducing NUMA-specific CPU devices. It should also fix some performance pitfalls (e.g. accidental use of PoolAllocator) introduced by an earlier refactoring of ProcessState that was also in preparation for NUMA. It restores the default use of the cpu_allocator() value (i.e. no SubAllocator) by model executions that don't use allocation visitors (since visitor registration must precede the first allocation, and hence can be detected at that time). PiperOrigin-RevId: 213505655
author: A. Unique TensorFlower <gardener@tensorflow.org> 2018-09-18 13:28:38 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-09-18 13:33:19 -0700
commit: 33170cc661f3838aa7d0d7fc19bb0c6ba4812a3c (patch)
tree: c33a2e11456e70c60526e65569956567b4f84cb3 /tensorflow/core/framework
parent: 199cb7746c7ad92d7be344363d8276c45fc7a4b8 (diff)
4 files changed, 57 insertions, 10 deletions
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 2a7ee16a16..84cee5569c 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -196,7 +196,7 @@ class CPUAllocatorFactory : public AllocatorFactory {
   class CPUSubAllocator : public SubAllocator {
    public:
     explicit CPUSubAllocator(CPUAllocator* cpu_allocator)
-        : cpu_allocator_(cpu_allocator) {}
+        : SubAllocator({}, {}), cpu_allocator_(cpu_allocator) {}
 
     void* Alloc(size_t alignment, size_t num_bytes) override {
       return cpu_allocator_->AllocateRaw(alignment, num_bytes);
@@ -222,4 +222,22 @@ Allocator* cpu_allocator() {
   }
   return cpu_alloc;
 }
+
+SubAllocator::SubAllocator(const std::vector<Visitor>& alloc_visitors,
+                           const std::vector<Visitor>& free_visitors)
+    : alloc_visitors_(alloc_visitors), free_visitors_(free_visitors) {}
+
+void SubAllocator::VisitAlloc(void* ptr, int index, size_t num_bytes) {
+  for (const auto& v : alloc_visitors_) {
+    v(ptr, index, num_bytes);
+  }
+}
+
+void SubAllocator::VisitFree(void* ptr, int index, size_t num_bytes) {
+  // Although we don't guarantee any order of visitor application, strive
+  // to apply free visitors in reverse order of alloc visitors.
+  for (int i = free_visitors_.size() - 1; i >= 0; --i) {
+    free_visitors_[i](ptr, index, num_bytes);
+  }
+}
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index ded120b704..8c23604625 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -387,13 +388,36 @@ void EnableCPUAllocatorStats(bool enable);
 // full statistics. By default, it's disabled.
 void EnableCPUAllocatorFullStats(bool enable);
 
-// Abstract interface of an object that does the underlying suballoc/free of
-// memory for a higher-level allocator.
+// An object that does the underlying suballoc/free of memory for a higher-level
+// allocator.  The expectation is that the higher-level allocator is doing some
+// kind of cache or pool management so that it will call SubAllocator::Alloc and
+// Free relatively infrequently, compared to the number of times its own
+// AllocateRaw and Free methods are called.
 class SubAllocator {
  public:
+  // Visitor gets called with a pointer to a memory area and its
+  // size in bytes.  The index value will be numa_node for a CPU
+  // allocator and GPU id for a GPU allocator.
+  typedef std::function<void(void*, int index, size_t)> Visitor;
+
+  SubAllocator(const std::vector<Visitor>& alloc_visitors,
+               const std::vector<Visitor>& free_visitors);
+
   virtual ~SubAllocator() {}
   virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
   virtual void Free(void* ptr, size_t num_bytes) = 0;
+
+ protected:
+  // Implementation of Alloc() method must call this on newly allocated
+  // value.
+  void VisitAlloc(void* ptr, int index, size_t num_bytes);
+
+  // Implementation of Free() method must call this on value to be
+  // freed immediately before deallocation.
+  void VisitFree(void* ptr, int index, size_t num_bytes);
+
+  const std::vector<Visitor> alloc_visitors_;
+  const std::vector<Visitor> free_visitors_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 794250a2c1..53ac639b4c 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -214,10 +214,12 @@ class DeviceBase {
 
   // This is overridden by GPU devices to reinitialize the derived
   // type returned by MakeGpuDevice.
-  virtual void ReinitializeGpuDevice(OpKernelContext* /*context*/,
-                                     PerOpGpuDevice* /*device*/,
-                                     DeviceContext* /*dc*/,
-                                     Allocator* /*allocator*/) {}
+  virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
+                                       PerOpGpuDevice* /*device*/,
+                                       DeviceContext* /*dc*/,
+                                       Allocator* /*allocator*/) {
+    return Status::OK();
+  }
 
   // Unimplemented by default
   virtual const DeviceAttributes& attributes() const;
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 80f2b12987..3e34bf0418 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -265,9 +265,12 @@ OpKernelContext::OpKernelContext(Params* params, int num_outputs)
   params_->ensure_eigen_gpu_device();
   if (params_->eigen_gpu_device != nullptr) {
     Allocator* eigen_gpu_allocator = get_allocator(AllocatorAttributes());
-    params_->device->ReinitializeGpuDevice(this, params_->eigen_gpu_device,
-                                           params_->op_device_context,
-                                           eigen_gpu_allocator);
+    Status s = params_->device->ReinitializeGpuDevice(
+        this, params_->eigen_gpu_device, params_->op_device_context,
+        eigen_gpu_allocator);
+    if (!s.ok()) {
+      SetStatus(s);
+    }
   }
   if (params_->record_tensor_accesses) {
     referenced_tensors_.Init();
author	A. Unique TensorFlower <gardener@tensorflow.org>	2018-09-18 13:28:38 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-09-18 13:33:19 -0700
commit	33170cc661f3838aa7d0d7fc19bb0c6ba4812a3c (patch)
tree	c33a2e11456e70c60526e65569956567b4f84cb3 /tensorflow/core/framework
parent	199cb7746c7ad92d7be344363d8276c45fc7a4b8 (diff)