path: root/tensorflow/contrib/verbs
author    A. Unique TensorFlower <gardener@tensorflow.org> 2018-09-17 17:49:36 -0700
committer TensorFlower Gardener <gardener@tensorflow.org> 2018-09-17 17:53:46 -0700
commit    185aa89912376d4088c22615908696cd30f9951b (patch)
tree      ba465945c19904129b830cc2465a3ee7b58b9247 /tensorflow/contrib/verbs
parent    4338803b98cd825b0b1d810bcc51c9a79734feb6 (diff)
Eliminate VisitableAllocator.
The visitor pattern is used to allow pre-registration of memory for DMA access, e.g. for fast GPU/CPU I/O and for RDMA networking. The VisitableAllocator interface was introduced to support this use some time ago, prior to SubAllocators. Memory registration works best if it's done infrequently, on large pieces of memory, rather than on every piece that's dynamically allocated/freed. This usage pattern fits the SubAllocator better than a general Allocator.

This change moves memory allocation visitor access to SubAllocator and eliminates the VisitableAllocator subclass of Allocator. It also more rigorously enforces the requirement that all visitors be declared before memory allocation begins, by requiring that visitors be provided to the SubAllocator constructor.

This refactoring will ease an upcoming CL introducing NUMA-specific CPU devices. It should also fix some performance pitfalls (e.g. accidental use of PoolAllocator) introduced by an earlier refactoring of ProcessState that was also in preparation for NUMA. It restores the default use of the cpu_allocator() value (i.e. no SubAllocator) for model executions that don't use allocation visitors; since visitor registration must precede the first allocation, this case can be detected at that time.

PiperOrigin-RevId: 213371553
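To make the constructor-time visitor requirement concrete, here is a minimal sketch of that arrangement. It is an illustration only, not TensorFlow's actual SubAllocator: DemoSubAllocator and everything inside it are invented names. The property it demonstrates is the one the message depends on: since visitors are supplied at construction, none can be registered after allocation has begun, and each large suballocation is visited exactly once.

#include <cstddef>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <utility>
#include <vector>

// Hypothetical stand-in for the SubAllocator arrangement described above.
// Visitors are fixed at construction, so no visitor can be added after the
// first allocation, and registration happens once per large suballocation
// rather than once per small dynamically allocated piece.
class DemoSubAllocator {
 public:
  using Visitor = std::function<void(void* ptr, int index, size_t num_bytes)>;

  DemoSubAllocator(std::vector<Visitor> alloc_visitors,
                   std::vector<Visitor> free_visitors)
      : alloc_visitors_(std::move(alloc_visitors)),
        free_visitors_(std::move(free_visitors)) {}

  void* Alloc(size_t num_bytes) {
    void* ptr = std::malloc(num_bytes);
    // E.g. pin the region for RDMA/DMA access.
    for (const auto& v : alloc_visitors_) v(ptr, /*index=*/0, num_bytes);
    return ptr;
  }

  void Free(void* ptr, size_t num_bytes) {
    for (const auto& v : free_visitors_) v(ptr, /*index=*/0, num_bytes);
    std::free(ptr);
  }

 private:
  const std::vector<Visitor> alloc_visitors_;
  const std::vector<Visitor> free_visitors_;
};

int main() {
  DemoSubAllocator sub(
      {[](void* p, int, size_t n) {
        std::cout << "register " << n << " bytes at " << p << "\n";
      }},
      {[](void* p, int, size_t n) {
        std::cout << "deregister " << n << " bytes at " << p << "\n";
      }});
  void* p = sub.Alloc(1 << 20);
  sub.Free(p, 1 << 20);
  return 0;
}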
Diffstat (limited to 'tensorflow/contrib/verbs')
-rw-r--r--  tensorflow/contrib/verbs/rdma_mgr.cc         | 81
-rw-r--r--  tensorflow/contrib/verbs/rdma_mgr.h          |  1
-rw-r--r--  tensorflow/contrib/verbs/verbs_server_lib.cc |  5
3 files changed, 30 insertions(+), 57 deletions(-)
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index 3cb5e61fac..2784bf124c 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -20,7 +20,6 @@ limitations under the License.
#include <vector>
#include "tensorflow/contrib/verbs/grpc_verbs_client.h"
#include "tensorflow/contrib/verbs/verbs_service.pb.h"
-#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"
@@ -29,6 +28,7 @@ limitations under the License.
#include "tensorflow/core/distributed_runtime/session_mgr.h"
#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
namespace tensorflow {
@@ -256,74 +256,41 @@ void MRDeleter(ibv_mr* mr) {
}
}
-// TODO(byronyi): remove this class and its registration when the default
-// cpu_allocator() returns visitable allocator, or cpu_allocator() is no
-// longer in use.
-class BFCRdmaAllocator : public BFCAllocator {
- public:
- BFCRdmaAllocator()
- : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36,
- true, "cpu_rdma_bfc") {}
-};
-class BFCRdmaAllocatorFactory : public AllocatorFactory {
- public:
- Allocator* CreateAllocator() { return new BFCRdmaAllocator; }
-
- SubAllocator* CreateSubAllocator(int numa_node) {
- return new BasicCPUAllocator(numa_node);
- }
-};
-
-REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocatorFactory);
-
void RdmaMgr::InitAllocators() {
- RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_;
+ static std::once_flag flag;
+ std::call_once(
+ flag, [this]() { RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_; });
+}
- Allocator* allocators[] = {
-#if GOOGLE_CUDA
- GPUProcessState::singleton()->GetCUDAHostAllocator(0),
-#endif // GOOGLE_CUDA
- ProcessState::singleton()->GetCPUAllocator(0),
- cpu_allocator(),
+/*static*/ void RdmaMgr::RegMemVisitors() {
+ SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node,
+ size_t num_bytes) {
+ RdmaMemoryMgr::Singleton().InsertMemoryRegion(
+ ptr, num_bytes, strings::StrCat("CPU:", numa_node));
+ };
+ SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node,
+ size_t num_bytes) {
+ RdmaMemoryMgr::Singleton().EvictMemoryRegion(ptr, num_bytes);
+ };
- using namespace std::placeholders;
-
- std::set<Allocator*> instrumented_;
-
- // Host memory allocators
- for (Allocator* allocator : allocators) {
- VisitableAllocator::Visitor alloc_visitor =
- std::bind(&RdmaMemoryMgr::InsertMemoryRegion,
- &RdmaMemoryMgr::Singleton(), _1, _2, allocator->Name());
- VisitableAllocator::Visitor free_visitor = std::bind(
- &RdmaMemoryMgr::EvictMemoryRegion, &RdmaMemoryMgr::Singleton(), _1, _2);
-
- auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
- CHECK(visitable_allocator)
- << "is not visitable for instrumentation" << allocator->Name();
- // Make sure we don't instrument the same allocator twice
- if (instrumented_.find(allocator) == std::end(instrumented_)) {
- visitable_allocator->AddAllocVisitor(alloc_visitor);
- visitable_allocator->AddFreeVisitor(free_visitor);
- instrumented_.insert(allocator);
- LOG(INFO) << "Instrumenting CPU allocator " << allocator->Name();
- }
- }
+ ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
+ ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
#if GOOGLE_CUDA
if (IsGDRAvailable()) {
// Note we don't free allocated GPU memory so there is no free visitor
int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
- char buf[8];
- sprintf(buf, "gpu");
- VisitableAllocator::Visitor cuda_alloc_visitor =
- std::bind(&RdmaMemoryMgr::InsertMemoryRegion,
- &RdmaMemoryMgr::Singleton(), _1, _2, std::string(buf));
-
+ SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id,
+ size_t num_bytes) {
+ RdmaMemoryMgr::Singleton().InsertMemoryRegion(
+ ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
+ };
GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
cuda_alloc_visitor);
+ GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
+ alloc_visitor);
+ GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
}
#endif // GOOGLE_CUDA
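The hunk above replaces the std::bind-built VisitableAllocator::Visitor callbacks with capture-free lambdas of shape (void* ptr, int index, size_t num_bytes). A self-contained sketch of that shape follows; DemoRegistry is an invented stand-in for RdmaMemoryMgr:

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>

// DemoRegistry is an invented stand-in for RdmaMemoryMgr.
struct DemoRegistry {
  static DemoRegistry& Singleton() {
    static DemoRegistry instance;
    return instance;
  }
  void InsertMemoryRegion(void* ptr, size_t n, const std::string& name) {
    std::cout << "pin " << n << " bytes at " << ptr << " for " << name << "\n";
  }
  void EvictMemoryRegion(void* ptr, size_t n) {
    std::cout << "unpin " << n << " bytes at " << ptr << "\n";
  }
};

using Visitor = std::function<void(void* ptr, int index, size_t num_bytes)>;

int main() {
  // Counterpart of alloc_visitor in the hunk: the numa_node argument arrives
  // per call, so nothing needs to be bound ahead of time as with std::bind.
  Visitor alloc_visitor = [](void* ptr, int numa_node, size_t num_bytes) {
    DemoRegistry::Singleton().InsertMemoryRegion(
        ptr, num_bytes, "CPU:" + std::to_string(numa_node));
  };
  Visitor free_visitor = [](void* ptr, int numa_node, size_t num_bytes) {
    DemoRegistry::Singleton().EvictMemoryRegion(ptr, num_bytes);
  };

  int dummy = 0;
  alloc_visitor(&dummy, /*numa_node=*/0, sizeof(dummy));
  free_visitor(&dummy, /*numa_node=*/0, sizeof(dummy));
  return 0;
}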
diff --git a/tensorflow/contrib/verbs/rdma_mgr.h b/tensorflow/contrib/verbs/rdma_mgr.h
index 9fffc335bb..74b92cc9a6 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.h
+++ b/tensorflow/contrib/verbs/rdma_mgr.h
@@ -39,6 +39,7 @@ class RdmaMgr {
void SetupChannels();
bool ConnectivityCheck();
void InitAllocators();
+ static void RegMemVisitors();
const string& local_worker() { return local_worker_; }
private:
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index 1a0b5028fe..61469686e4 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -76,8 +76,13 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
return Status::OK();
}
+namespace {
+std::once_flag reg_mem_visitors_call;
+} // namespace
+
Status VerbsServer::Init(ServiceInitFunction service_func,
RendezvousMgrCreationFunction rendezvous_mgr_func) {
+ std::call_once(reg_mem_visitors_call, []() { RdmaMgr::RegMemVisitors(); });
Status s = GrpcServer::Init(service_func, rendezvous_mgr_func);
{
mutex_lock l(mu_);
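The guard added to VerbsServer::Init is the standard std::call_once idiom. A self-contained sketch (all names illustrative) showing that repeated Init-style calls register the visitors only once:

#include <iostream>
#include <mutex>

namespace {
// Same idiom as the hunk above; all names here are illustrative.
std::once_flag reg_mem_visitors_call;

void RegMemVisitors() { std::cout << "visitors registered\n"; }
}  // namespace

void InitServer() {
  std::call_once(reg_mem_visitors_call, []() { RegMemVisitors(); });
  // ... the rest of server initialization would follow here.
}

int main() {
  InitServer();
  InitServer();  // No-op: the message prints exactly once.
  return 0;
}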