Diffstat (limited to 'tensorflow')
32 files changed, 628 insertions, 577 deletions
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index 726f74c7b7..bb06f1c41c 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -138,6 +138,8 @@ class GdrMemoryManager : public RemoteMemoryManager {
                       Device* device, DeviceContext* device_context,
                       bool on_host, StatusCallback done) override;
 
+  static void RegMemVisitors();
+
 protected:
   Status CreateEndpoint(const string& host, const string& port,
                         RdmaEndpointPtr& endpoint);
@@ -183,35 +185,51 @@ class GdrMemoryManager : public RemoteMemoryManager {
   TF_DISALLOW_COPY_AND_ASSIGN(GdrMemoryManager);
 };
 
-// TODO(byronyi): remove this class and its registration when the default
-// cpu_allocator() returns visitable allocator, or cpu_allocator() is no
-// longer in use.
-class BFCGdrAllocator : public BFCAllocator {
- public:
-  BFCGdrAllocator()
-      : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36,
-                     true, "cpu_gdr_bfc") {}
-};
-class BFCGdrAllocatorFactory : public AllocatorFactory {
- public:
-  Allocator* CreateAllocator() override { return new BFCGdrAllocator; }
-
-  virtual SubAllocator* CreateSubAllocator(int numa_node) {
-    return new BasicCPUAllocator(numa_node);
-  }
-};
-
-REGISTER_MEM_ALLOCATOR("BFCGdrAllocator", 102, BFCGdrAllocatorFactory);
-
 GdrMemoryManager::GdrMemoryManager(const string& host, const string& port)
     : host_(host),
       port_(port),
       listening_(nullptr, EndpointDeleter),
       stopped_(true),
-      next_key_(0) {}
+      next_key_(0) {
+  static std::once_flag flag;
+  std::call_once(flag, []() { RegMemVisitors(); });
+}
 
 GdrMemoryManager::~GdrMemoryManager() { close(epfd_); }
 
+/*static*/ void GdrMemoryManager::RegMemVisitors() {
+  SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node,
+                                           size_t num_bytes) {
+    GdrMemoryManager::Singleton().InsertMemoryRegion(
+        ptr, num_bytes, strings::StrCat("CPU:", numa_node));
+  };
+  SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node,
+                                          size_t num_bytes) {
+    GdrMemoryManager::Singleton().EvictMemoryRegion(ptr, num_bytes);
+  };
+  ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
+  ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
+
+#if GOOGLE_CUDA
+  if (IsGDRAvailable()) {
+    int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
+
+    // Note we don't free allocated GPU memory so there is no free visitor
+    SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id,
+                                                  size_t num_bytes) {
+      RdmaMemoryMgr::Singleton().InsertMemoryRegion(
+          ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
+    };
+    GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
+                                                     cuda_alloc_visitor);
+    GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
+                                                          alloc_visitor);
+    GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
+    LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
+  }
+#endif  // GOOGLE_CUDA
+}
+
 Status GdrMemoryManager::Init() {
   epfd_ = epoll_create1(0);
   if (epfd_ == -1) {
@@ -271,48 +289,6 @@ Status GdrMemoryManager::Init() {
                                     "cannot add server to epoll");
   }
 
-  Allocator* allocators[] = {
-#if GOOGLE_CUDA
-    GPUProcessState::singleton()->GetCUDAHostAllocator(0),
-#endif  // GOOGLE_CUDA
-    ProcessState::singleton()->GetCPUAllocator(0),
-    cpu_allocator(),
-  };
-
-  using namespace std::placeholders;
-  VisitableAllocator::Visitor alloc_visitor =
-      std::bind(&GdrMemoryManager::InsertMemoryRegion, this, _1, _2);
-  VisitableAllocator::Visitor free_visitor =
-      std::bind(&GdrMemoryManager::EvictMemoryRegion, this, _1, _2);
-
-  std::set<Allocator*> instrumented_;
-
-  // Host memory allocators
-  for (Allocator* allocator : allocators) {
-    auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
-    CHECK(visitable_allocator)
-        << "is not visitable for instrumentation" << allocator->Name();
-    // Make sure we don't instrument the same allocator twice
-    if (instrumented_.find(allocator) == std::end(instrumented_)) {
-      visitable_allocator->AddAllocVisitor(alloc_visitor);
-      visitable_allocator->AddFreeVisitor(free_visitor);
-      instrumented_.insert(allocator);
-      LOG(INFO) << "Instrumenting CPU allocator " << allocator->Name();
-    }
-  }
-
-#if GOOGLE_CUDA
-  VisitableAllocator::Visitor cuda_alloc_visitor =
-      std::bind(&GdrMemoryManager::InsertMemoryRegion, this, _1, _2);
-  if (IsGDRAvailable()) {
-    // Note we don't free allocated GPU memory so there is no free visitor
-    int32_t bus_id = TryToReadNumaNode(listening_->verbs->device) + 1;
-    GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
-                                                     cuda_alloc_visitor);
-    LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
-  }
-#endif  // GOOGLE_CUDA
-
   return Status::OK();
 }
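The pattern introduced above replaces the old std::bind-based VisitableAllocator::Visitor callbacks with SubAllocator::Visitor lambdas that also receive the NUMA node (or GPU id), so a registered region can be labeled "CPU:<node>" or "GPU:<id>". A minimal self-contained sketch of that shape; the MemoryRegistry type here is a hypothetical stand-in for GdrMemoryManager/RdmaMemoryMgr, and only the visitor signature comes from the patch:

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>

// Hypothetical stand-in for RdmaMemoryMgr / GdrMemoryManager.
struct MemoryRegistry {
  static MemoryRegistry& Singleton() {
    static MemoryRegistry instance;
    return instance;
  }
  void InsertMemoryRegion(void* ptr, size_t bytes, const std::string& tag) {
    std::cout << "register " << bytes << " bytes at " << ptr << " (" << tag
              << ")\n";
  }
  void EvictMemoryRegion(void* ptr, size_t bytes) {
    std::cout << "evict " << bytes << " bytes at " << ptr << "\n";
  }
};

// Visitor signature matching SubAllocator::Visitor in this change:
// (pointer, numa_node-or-gpu_id, size).
using Visitor = std::function<void(void*, int, size_t)>;

int main() {
  Visitor alloc_visitor = [](void* ptr, int numa_node, size_t num_bytes) {
    MemoryRegistry::Singleton().InsertMemoryRegion(
        ptr, num_bytes, "CPU:" + std::to_string(numa_node));
  };
  Visitor free_visitor = [](void* ptr, int numa_node, size_t num_bytes) {
    MemoryRegistry::Singleton().EvictMemoryRegion(ptr, num_bytes);
  };
  char buf[64];
  alloc_visitor(buf, 0, sizeof(buf));
  free_visitor(buf, 0, sizeof(buf));
}

The extra int argument lets a single lambda serve every NUMA node or GPU, where the old std::bind form had to bake an allocator name into each registration.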
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index 3cb5e61fac..2784bf124c 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/contrib/verbs/grpc_verbs_client.h"
 #include "tensorflow/contrib/verbs/verbs_service.pb.h"
-#include "tensorflow/core/common_runtime/bfc_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/common_runtime/pool_allocator.h"
@@ -29,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
 
@@ -256,74 +256,41 @@ void MRDeleter(ibv_mr* mr) {
   }
 }
 
-// TODO(byronyi): remove this class and its registration when the default
-// cpu_allocator() returns visitable allocator, or cpu_allocator() is no
-// longer in use.
-class BFCRdmaAllocator : public BFCAllocator {
- public:
-  BFCRdmaAllocator()
-      : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36,
-                     true, "cpu_rdma_bfc") {}
-};
-class BFCRdmaAllocatorFactory : public AllocatorFactory {
- public:
-  Allocator* CreateAllocator() { return new BFCRdmaAllocator; }
-
-  SubAllocator* CreateSubAllocator(int numa_node) {
-    return new BasicCPUAllocator(numa_node);
-  }
-};
-
-REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocatorFactory);
-
 void RdmaMgr::InitAllocators() {
-  RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_;
+  static std::once_flag flag;
+  std::call_once(
+      flag, [this]() { RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_; });
+}
 
-  Allocator* allocators[] = {
-#if GOOGLE_CUDA
-    GPUProcessState::singleton()->GetCUDAHostAllocator(0),
-#endif  // GOOGLE_CUDA
-    ProcessState::singleton()->GetCPUAllocator(0),
-    cpu_allocator(),
+/*static*/ void RdmaMgr::RegMemVisitors() {
+  SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node,
+                                           size_t num_bytes) {
+    RdmaMemoryMgr::Singleton().InsertMemoryRegion(
+        ptr, num_bytes, strings::StrCat("CPU:", numa_node));
+  };
+  SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node,
+                                          size_t num_bytes) {
+    RdmaMemoryMgr::Singleton().EvictMemoryRegion(ptr, num_bytes);
   };
 
-  using namespace std::placeholders;
-
-  std::set<Allocator*> instrumented_;
-
-  // Host memory allocators
-  for (Allocator* allocator : allocators) {
-    VisitableAllocator::Visitor alloc_visitor =
-        std::bind(&RdmaMemoryMgr::InsertMemoryRegion,
-                  &RdmaMemoryMgr::Singleton(), _1, _2, allocator->Name());
-    VisitableAllocator::Visitor free_visitor = std::bind(
-        &RdmaMemoryMgr::EvictMemoryRegion, &RdmaMemoryMgr::Singleton(), _1, _2);
-
-    auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
-    CHECK(visitable_allocator)
-        << "is not visitable for instrumentation" << allocator->Name();
-    // Make sure we don't instrument the same allocator twice
-    if (instrumented_.find(allocator) == std::end(instrumented_)) {
-      visitable_allocator->AddAllocVisitor(alloc_visitor);
-      visitable_allocator->AddFreeVisitor(free_visitor);
-      instrumented_.insert(allocator);
-      LOG(INFO) << "Instrumenting CPU allocator " << allocator->Name();
-    }
-  }
+  ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
+  ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
 
 #if GOOGLE_CUDA
   if (IsGDRAvailable()) {
     // Note we don't free allocated GPU memory so there is no free visitor
     int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
 
-    char buf[8];
-    sprintf(buf, "gpu");
-    VisitableAllocator::Visitor cuda_alloc_visitor =
-        std::bind(&RdmaMemoryMgr::InsertMemoryRegion,
-                  &RdmaMemoryMgr::Singleton(), _1, _2, std::string(buf));
-
+    SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id,
+                                                  size_t num_bytes) {
+      RdmaMemoryMgr::Singleton().InsertMemoryRegion(
+          ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
+    };
     GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
                                                      cuda_alloc_visitor);
+    GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
+                                                          alloc_visitor);
+    GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
     LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
   }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/verbs/rdma_mgr.h b/tensorflow/contrib/verbs/rdma_mgr.h
index 9fffc335bb..74b92cc9a6 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.h
+++ b/tensorflow/contrib/verbs/rdma_mgr.h
@@ -39,6 +39,7 @@ class RdmaMgr {
   void SetupChannels();
   bool ConnectivityCheck();
   void InitAllocators();
+  static void RegMemVisitors();
   const string& local_worker() { return local_worker_; }
 
 private:
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index 1a0b5028fe..61469686e4 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -76,8 +76,13 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
   return Status::OK();
 }
 
+namespace {
+std::once_flag reg_mem_visitors_call;
+}  // namespace
+
 Status VerbsServer::Init(ServiceInitFunction service_func,
                          RendezvousMgrCreationFunction rendezvous_mgr_func) {
+  std::call_once(reg_mem_visitors_call, []() { RdmaMgr::RegMemVisitors(); });
   Status s = GrpcServer::Init(service_func, rendezvous_mgr_func);
   {
     mutex_lock l(mu_);
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index d55bd8d7ed..9bcf5b0865 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2783,7 +2783,6 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/step_stats_collector.h",
     "common_runtime/threadpool_device.h",
     "common_runtime/tracing_device.h",
-    "common_runtime/visitable_allocator.h",
     "common_runtime/process_state.h",
     "common_runtime/pool_allocator.h",
     "graph/gradients.h",
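The once-only registration in verbs_server_lib.cc relies on a namespace-scope std::once_flag (the type is std::once_flag; there is no std::once_call in the standard library, which is why the declaration is corrected above). A compilable sketch of the same idiom, with RegMemVisitors reduced to a stub:

#include <iostream>
#include <mutex>

namespace {
std::once_flag reg_mem_visitors_call;
}  // namespace

void RegMemVisitors() { std::cout << "visitors registered\n"; }

// Hypothetical stand-in for VerbsServer::Init(): the registration runs
// exactly once no matter how many times Init() is invoked.
void Init() {
  std::call_once(reg_mem_visitors_call, []() { RegMemVisitors(); });
  std::cout << "server initialized\n";
}

int main() {
  Init();
  Init();  // RegMemVisitors() is not called again.
}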
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 84c6285bbe..3843ea9e60 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -31,7 +31,7 @@ namespace tensorflow {
 
 BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                            bool allow_growth, const string& name)
-    : suballocator_(sub_allocator),
+    : sub_allocator_(sub_allocator),
       name_(name),
       free_chunks_list_(kInvalidChunkHandle),
       next_allocation_id_(1) {
@@ -72,7 +72,7 @@ BFCAllocator::~BFCAllocator() {
   VLOG(2) << "Number of regions allocated: "
          << region_manager_.regions().size();
   for (const auto& region : region_manager_.regions()) {
-    suballocator_->Free(region.ptr(), region.memory_size());
+    sub_allocator_->Free(region.ptr(), region.memory_size());
   }
 
   for (BinNum b = 0; b < kNumBins; b++) {
@@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
 
   // Try allocating.
   size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
-  void* mem_addr = suballocator_->Alloc(alignment, bytes);
+  void* mem_addr = sub_allocator_->Alloc(alignment, bytes);
   if (mem_addr == nullptr && !started_backpedal_) {
     // Only backpedal once.
     started_backpedal_ = true;
@@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
     while (mem_addr == nullptr) {
       bytes = RoundedBytes(bytes * kBackpedalFactor);
       if (bytes < rounded_bytes) break;
-      mem_addr = suballocator_->Alloc(alignment, bytes);
+      mem_addr = sub_allocator_->Alloc(alignment, bytes);
     }
   }
 
@@ -158,10 +158,6 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
   // Insert the chunk into the right bin.
   InsertFreeChunkIntoBin(h);
 
-  // Invoke visitors on newly allocated region.
-  for (const auto& visitor : region_visitors_) {
-    visitor(mem_addr, bytes);
-  }
   return true;
 }
 
@@ -490,15 +486,6 @@ void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) {
   InsertFreeChunkIntoBin(coalesced_chunk);
 }
 
-void BFCAllocator::AddAllocVisitor(Visitor visitor) {
-  VLOG(1) << "AddVisitor";
-  mutex_lock l(lock_);
-  region_visitors_.push_back(visitor);
-  for (const auto& region : region_manager_.regions()) {
-    visitor(region.ptr(), region.memory_size());
-  }
-}
-
 bool BFCAllocator::TracksAllocationSizes() { return true; }
 
 size_t BFCAllocator::RequestedSize(const void* ptr) {
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 20e1dab1d5..364071e066 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/macros.h"
@@ -42,7 +42,7 @@ namespace tensorflow {
 // coalescing. One assumption we make is that the process using this
 // allocator owns pretty much all of the memory, and that nearly
 // all requests to allocate memory go through this interface.
-class BFCAllocator : public VisitableAllocator {
+class BFCAllocator : public Allocator {
 public:
   // Takes ownership of sub_allocator.
   BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
@@ -55,11 +55,6 @@ class BFCAllocator : public VisitableAllocator {
                     const AllocationAttributes& allocation_attr) override;
   void DeallocateRaw(void* ptr) override;
 
-  void AddAllocVisitor(Visitor visitor) override;
-
-  // Does nothing, because memory is never freed.
-  void AddFreeVisitor(Visitor visitor) override {}
-
   bool TracksAllocationSizes() override;
 
   size_t RequestedSize(const void* ptr) override;
@@ -423,7 +418,7 @@ class BFCAllocator : public VisitableAllocator {
   // of the available memory.
   bool started_backpedal_ = false;
 
-  std::unique_ptr<SubAllocator> suballocator_;
+  std::unique_ptr<SubAllocator> sub_allocator_;
   string name_;
 
   // Structures mutable after construction
@@ -435,9 +430,6 @@ class BFCAllocator : public VisitableAllocator {
   // Pointer to head of linked list of free Chunks
   ChunkHandle free_chunks_list_ GUARDED_BY(lock_);
 
-  // Called once on each region, ASAP.
-  std::vector<Visitor> region_visitors_ GUARDED_BY(lock_);
-
   // Counter containing the next unique identifier to assign to a
   // newly-created chunk.
   int64 next_allocation_id_ GUARDED_BY(lock_);
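With AddAllocVisitor/AddFreeVisitor gone from BFCAllocator, visitor bookkeeping lives in the SubAllocator base class that sub_allocator_ now points at. The actual TensorFlow definition is not shown in this diff; a plausible minimal sketch of such a base class, using only the names visible in the patch (the VisitAlloc/VisitFree bodies are assumptions):

#include <cstddef>
#include <functional>
#include <vector>

class SubAllocator {
 public:
  using Visitor = std::function<void(void*, int index, size_t)>;

  SubAllocator(const std::vector<Visitor>& alloc_visitors,
               const std::vector<Visitor>& free_visitors)
      : alloc_visitors_(alloc_visitors), free_visitors_(free_visitors) {}
  virtual ~SubAllocator() {}

  virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
  virtual void Free(void* ptr, size_t num_bytes) = 0;

 protected:
  // Subclasses call these from Alloc()/Free(); 'index' is the NUMA node
  // for CPU suballocators and the GPU id for GPU suballocators.
  void VisitAlloc(void* ptr, int index, size_t num_bytes) {
    for (const auto& v : alloc_visitors_) v(ptr, index, num_bytes);
  }
  void VisitFree(void* ptr, int index, size_t num_bytes) {
    for (const auto& v : free_visitors_) v(ptr, index, num_bytes);
  }

 private:
  const std::vector<Visitor> alloc_visitors_;
  const std::vector<Visitor> free_visitors_;
};

Because the visitor lists are fixed at construction, there is no longer a race between allocator growth and late visitor registration, which is what the old AddAllocVisitor replay loop had to handle.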
diff --git a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h b/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
index 636cd43575..6bd29ef775 100644
--- a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
@@ -26,8 +26,12 @@ namespace tensorflow {
 class CUDAHostAllocator : public SubAllocator {
 public:
   // Note: stream_exec cannot be null.
-  explicit CUDAHostAllocator(se::StreamExecutor* stream_exec)
-      : stream_exec_(stream_exec) {
+  explicit CUDAHostAllocator(se::StreamExecutor* stream_exec, int numa_node,
+                             const std::vector<Visitor>& alloc_visitors,
+                             const std::vector<Visitor>& free_visitors)
+      : SubAllocator(alloc_visitors, free_visitors),
+        stream_exec_(stream_exec),
+        numa_node_(numa_node) {
     CHECK(stream_exec_ != nullptr);
   }
   ~CUDAHostAllocator() override {}
@@ -39,19 +43,23 @@ class CUDAHostAllocator : public SubAllocator {
       if (ptr == nullptr) {
         LOG(WARNING) << "could not allocate pinned host memory of size: "
                      << num_bytes;
+        return ptr;
       }
+      VisitAlloc(ptr, numa_node_, num_bytes);
     }
     return ptr;
   }
 
   void Free(void* ptr, size_t num_bytes) override {
     if (ptr != nullptr) {
+      VisitFree(ptr, numa_node_, num_bytes);
       stream_exec_->HostMemoryDeallocate(ptr);
     }
   }
 
 private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
+  const int numa_node_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
index 2d4c8d0201..44ffce77a1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
@@ -22,18 +22,15 @@ limitations under the License.
 
 namespace tensorflow {
 
-GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
-                                 const string& name)
-    : GPUBFCAllocator(cuda_gpu_id, total_memory, GPUOptions(), name) {}
+GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator,
+                                 size_t total_memory, const string& name)
+    : GPUBFCAllocator(sub_allocator, total_memory, GPUOptions(), name) {}
 
-GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator,
+                                 size_t total_memory,
                                  const GPUOptions& gpu_options,
                                  const string& name)
-    : BFCAllocator(
-          new GPUMemAllocator(
-              GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
-              gpu_options.per_process_gpu_memory_fraction() > 1.0 ||
-                  gpu_options.experimental().use_unified_memory()),
-          total_memory, gpu_options.allow_growth(), name) {}
+    : BFCAllocator(sub_allocator, total_memory, gpu_options.allow_growth(),
+                   name) {}
 
 }  // namespace tensorflow
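Note the ordering contract now visible in CUDAHostAllocator: VisitAlloc runs only after a successful allocation (the new early return skips it on failure), and VisitFree runs before the memory is handed back. A toy host suballocator built on the SubAllocator sketch shown earlier, with std::malloc/std::free standing in for StreamExecutor's pinned-memory calls; everything except the visit ordering is an assumption:

#include <cstdlib>

// Builds on the SubAllocator sketch above (same header assumptions).
class ToyHostAllocator : public SubAllocator {
 public:
  ToyHostAllocator(int numa_node,
                   const std::vector<Visitor>& alloc_visitors,
                   const std::vector<Visitor>& free_visitors)
      : SubAllocator(alloc_visitors, free_visitors), numa_node_(numa_node) {}

  void* Alloc(size_t alignment, size_t num_bytes) override {
    void* ptr = nullptr;
    if (num_bytes > 0) {
      ptr = std::malloc(num_bytes);    // stand-in for HostMemoryAllocate
      if (ptr == nullptr) return ptr;  // failure: no visitor runs
      VisitAlloc(ptr, numa_node_, num_bytes);
    }
    return ptr;
  }

  void Free(void* ptr, size_t num_bytes) override {
    if (ptr != nullptr) {
      VisitFree(ptr, numa_node_, num_bytes);  // before the memory goes away
      std::free(ptr);
    }
  }

 private:
  const int numa_node_;
};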
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index f1cc2eace1..6b6de80734 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -31,28 +31,20 @@ limitations under the License.
 
 namespace tensorflow {
 
-// A GPU memory allocator that implements a 'best-fit with coalescing'
-// algorithm.
-class GPUBFCAllocator : public BFCAllocator {
- public:
-  // 'cuda_gpu_id' refers to the ID of the GPU device within
-  // the process and must reference a valid ID in the process.
-  GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
-                  const string& name);
-  GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
-                  const GPUOptions& gpu_options, const string& name);
-  virtual ~GPUBFCAllocator() {}
-
-  TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
-};
-
 // Suballocator for GPU memory.
 class GPUMemAllocator : public SubAllocator {
 public:
+  // 'cuda_gpu_id' refers to the ID of the GPU device within
+  // the process and must reference a valid ID in the process.
   // Note: stream_exec cannot be null.
-  explicit GPUMemAllocator(se::StreamExecutor* stream_exec,
-                           bool use_unified_memory)
-      : stream_exec_(stream_exec), use_unified_memory_(use_unified_memory) {
+  explicit GPUMemAllocator(se::StreamExecutor* stream_exec, CudaGpuId gpu_id,
+                           bool use_unified_memory,
+                           const std::vector<Visitor>& alloc_visitors,
+                           const std::vector<Visitor>& free_visitors)
+      : SubAllocator(alloc_visitors, free_visitors),
+        stream_exec_(stream_exec),
+        gpu_id_(gpu_id),
+        use_unified_memory_(use_unified_memory) {
     CHECK(stream_exec_ != nullptr);
   }
   ~GPUMemAllocator() override {}
@@ -65,12 +57,14 @@ class GPUMemAllocator : public SubAllocator {
     } else {
       ptr = stream_exec_->AllocateArray<char>(num_bytes).opaque();
     }
+    VisitAlloc(ptr, gpu_id_.value(), num_bytes);
   }
   return ptr;
 }
 
 void Free(void* ptr, size_t num_bytes) override {
   if (ptr != nullptr) {
+    VisitFree(ptr, gpu_id_.value(), num_bytes);
     if (use_unified_memory_) {
       stream_exec_->UnifiedMemoryDeallocate(ptr);
     } else {
@@ -82,11 +76,25 @@ class GPUMemAllocator : public SubAllocator {
 private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
+  const CudaGpuId gpu_id_;
   const bool use_unified_memory_ = false;
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator);
 };
 
+// A GPU memory allocator that implements a 'best-fit with coalescing'
+// algorithm.
+class GPUBFCAllocator : public BFCAllocator {
+ public:
+  GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory,
+                  const string& name);
+  GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory,
+                  const GPUOptions& gpu_options, const string& name);
+  ~GPUBFCAllocator() override {}
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 67caeb3495..7112c3afd4 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -46,7 +47,11 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
 }
 
 TEST(GPUBFCAllocatorTest, NoDups) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   // Allocate a lot of raw pointers
@@ -75,7 +80,11 @@ TEST(GPUBFCAllocatorTest, NoDups) {
 }
 
 TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   // Allocate 256 raw pointers of sizes between 100 bytes and about
   // a meg
   random::PhiloxRandom philox(123, 17);
@@ -133,7 +142,11 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
 }
 
 TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   float* first_ptr = a.Allocate<float>(1024);
@@ -168,18 +181,30 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
 }
 
 TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   float* ptr = a.Allocate<float>(0);
   EXPECT_EQ(nullptr, ptr);
 }
 
 TEST(GPUBFCAllocatorTest, TracksSizes) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
 TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   float* t1 = a.Allocate<float>(1);
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
@@ -187,8 +212,12 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
 }
 
 TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
   // Configure a 1MiB limit
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc");
+  GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc");
 
   float* first_ptr = a.Allocate<float>(1 << 6);
   float* second_ptr = a.Allocate<float>(1 << 20);
@@ -203,7 +232,11 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
   options.set_allow_growth(true);
 
   // Max of 2GiB, but starts out small.
-  GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1LL << 31, options, "GPU_0_bfc");
 
   // Allocate 10 raw pointers of sizes between 100 bytes and about
   // 64 megs.
@@ -264,8 +297,15 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
 }
 
 TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
-  GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
-  GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1UL << 60, "GPU_0_bfc");
+  sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator b(sub_allocator, 1UL << 60, "GPU_0_bfc");
   void* amem = a.AllocateRaw(1, 1);
   void* bmem = b.AllocateRaw(1, 1 << 30);
   a.DeallocateRaw(amem);
@@ -273,7 +313,11 @@ TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
 }
 
 static void BM_Allocation(int iters) {
-  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<size_t> sizes = {256,  4096,    16384,    524288,
                                512,  1048576, 10485760, 104857600,
@@ -289,7 +333,11 @@ static void BM_Allocation(int iters) {
 BENCHMARK(BM_Allocation);
 
 static void BM_AllocationThreaded(int iters, int num_threads) {
-  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
   thread::ThreadPool pool(Env::Default(), "test", num_threads);
   std::atomic_int_fast32_t count(iters);
   mutex done_lock;
@@ -325,7 +373,11 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);
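Every test and benchmark in this file now repeats the same four-line GPUMemAllocator setup. A hypothetical helper, not part of this change, could shrink that boilerplate if the file is touched again; it reuses only types already present in the test:

// Hypothetical convenience helper for the tests above; not in the diff.
static GPUBFCAllocator* CreateGPUBFCAllocator(size_t total_memory,
                                              const string& name) {
  CudaGpuId cuda_gpu_id(0);
  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
      false /*use_unified_memory*/, {}, {});
  return new GPUBFCAllocator(sub_allocator, total_memory, name);
}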
 // A more complex benchmark that defers deallocation of an object for
 // "delay" allocations.
 static void BM_AllocationDelayed(int iters, int delay) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
   int size_index = 0;
@@ -363,7 +415,11 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   // only methods inside this class can access private members of BFCAllocator.
 
   void TestBinDebugInfo() {
-    GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+    CudaGpuId cuda_gpu_id(0);
+    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+        false /*use_unified_memory*/, {}, {});
+    GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
 
     std::vector<void*> initial_ptrs;
     std::vector<size_t> initial_ptrs_allocated_sizes;
@@ -441,7 +497,11 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   }
 
   void TestLog2FloorNonZeroSlow() {
-    GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc");
+    CudaGpuId cuda_gpu_id(0);
+    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+        false /*use_unified_memory*/, {}, {});
+    GPUBFCAllocator a(sub_allocator, 1 /* total_memory */, "GPU_0_bfc");
     EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
     EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
     EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 934a57a5fb..8e14f1ea75 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -27,7 +27,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator,
+GPUcudaMallocAllocator::GPUcudaMallocAllocator(Allocator* allocator,
                                                CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
   stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -60,14 +60,6 @@ void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
 #endif  // GOOGLE_CUDA
 }
 
-void GPUcudaMallocAllocator::AddAllocVisitor(Visitor visitor) {
-  return base_allocator_->AddAllocVisitor(visitor);
-}
-
-void GPUcudaMallocAllocator::AddFreeVisitor(Visitor visitor) {
-  return base_allocator_->AddFreeVisitor(visitor);
-}
-
 bool GPUcudaMallocAllocator::TracksAllocationSizes() { return false; }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 856fdc34b4..3d1d0ef481 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
@@ -29,20 +29,17 @@ namespace tensorflow {
 // An allocator that wraps a GPU allocator and adds debugging
 // functionality that verifies that users do not write outside their
 // allocated memory.
-class GPUcudaMallocAllocator : public VisitableAllocator {
+class GPUcudaMallocAllocator : public Allocator {
 public:
-  explicit GPUcudaMallocAllocator(VisitableAllocator* allocator,
-                                  CudaGpuId cuda_gpu_id);
+  explicit GPUcudaMallocAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
   ~GPUcudaMallocAllocator() override;
   string Name() override { return "gpu_debug"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
-  void AddAllocVisitor(Visitor visitor) override;
-  void AddFreeVisitor(Visitor visitor) override;
   bool TracksAllocationSizes() override;
 
 private:
-  VisitableAllocator* base_allocator_ = nullptr;  // owned
+  Allocator* base_allocator_ = nullptr;  // owned
 
   se::StreamExecutor* stream_exec_;  // Not owned.
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index e4c834b30d..6bad66dcec 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -73,7 +73,7 @@ void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
 // -----------------------------------------------------------------------------
 // GPUDebugAllocator
 // -----------------------------------------------------------------------------
-GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator,
+GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
                                      CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
   stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -111,14 +111,6 @@ void GPUDebugAllocator::DeallocateRaw(void* ptr) {
   base_allocator_->DeallocateRaw(ptr);
 }
 
-void GPUDebugAllocator::AddAllocVisitor(Visitor visitor) {
-  return base_allocator_->AddAllocVisitor(visitor);
-}
-
-void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) {
-  return base_allocator_->AddFreeVisitor(visitor);
-}
-
 bool GPUDebugAllocator::TracksAllocationSizes() { return true; }
 
 size_t GPUDebugAllocator::RequestedSize(const void* ptr) {
@@ -158,7 +150,7 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) {
 // -----------------------------------------------------------------------------
 // GPUNanResetAllocator
 // -----------------------------------------------------------------------------
-GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator,
+GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
                                            CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
   stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -200,14 +192,6 @@ void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
   base_allocator_->DeallocateRaw(ptr);
 }
 
-void GPUNanResetAllocator::AddAllocVisitor(Visitor visitor) {
-  return base_allocator_->AddAllocVisitor(visitor);
-}
-
-void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) {
-  return base_allocator_->AddFreeVisitor(visitor);
-}
-
 size_t GPUNanResetAllocator::RequestedSize(const void* ptr) {
   return base_allocator_->RequestedSize(ptr);
 }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 0f9b72040c..0f27ff4384 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
@@ -31,16 +31,13 @@ namespace tensorflow {
 // An allocator that wraps a GPU allocator and adds debugging
 // functionality that verifies that users do not write outside their
 // allocated memory.
-class GPUDebugAllocator : public VisitableAllocator {
+class GPUDebugAllocator : public Allocator {
 public:
-  explicit GPUDebugAllocator(VisitableAllocator* allocator,
-                             CudaGpuId cuda_gpu_id);
+  explicit GPUDebugAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
   ~GPUDebugAllocator() override;
   string Name() override { return "gpu_debug"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
-  void AddAllocVisitor(Visitor visitor) override;
-  void AddFreeVisitor(Visitor visitor) override;
   bool TracksAllocationSizes() override;
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
@@ -53,7 +50,7 @@ class GPUDebugAllocator : public VisitableAllocator {
   bool CheckFooter(void* ptr);
 
 private:
-  VisitableAllocator* base_allocator_ = nullptr;  // owned
+  Allocator* base_allocator_ = nullptr;  // owned
 
   se::StreamExecutor* stream_exec_;  // Not owned.
 
@@ -63,23 +60,20 @@ class GPUDebugAllocator : public VisitableAllocator {
 // An allocator that wraps a GPU allocator and resets the memory on
 // allocation and free to 'NaN', helping to identify cases where the
 // user forgets to initialize the memory.
-class GPUNanResetAllocator : public VisitableAllocator {
+class GPUNanResetAllocator : public Allocator {
 public:
-  explicit GPUNanResetAllocator(VisitableAllocator* allocator,
-                                CudaGpuId cuda_gpu_id);
+  explicit GPUNanResetAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
   ~GPUNanResetAllocator() override;
   string Name() override { return "gpu_nan_reset"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
-  void AddAllocVisitor(Visitor visitor) override;
-  void AddFreeVisitor(Visitor visitor) override;
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
   void GetStats(AllocatorStats* stats) override;
   void ClearStats() override;
 
 private:
-  VisitableAllocator* base_allocator_ = nullptr;  // owned
+  Allocator* base_allocator_ = nullptr;  // owned
 
   se::StreamExecutor* stream_exec_;  // Not owned.
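GPUDebugAllocator and GPUNanResetAllocator are now plain Allocator decorators: they own a base allocator, forward the core calls, and no longer forward visitor registration (visitors attach to the SubAllocator instead). A generic sketch of that decorator shape, using a minimal stand-in interface rather than TensorFlow's real Allocator:

#include <cstddef>
#include <string>

// Minimal stand-in in the spirit of tensorflow::Allocator.
class Allocator {
 public:
  virtual ~Allocator() {}
  virtual std::string Name() = 0;
  virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0;
  virtual void DeallocateRaw(void* ptr) = 0;
};

// A pass-through wrapper: after this change the debug allocators decorate
// a plain Allocator; there is no visitor plumbing left to forward.
class WrappingAllocator : public Allocator {
 public:
  explicit WrappingAllocator(Allocator* base) : base_(base) {}
  ~WrappingAllocator() override { delete base_; }
  std::string Name() override { return "wrapper(" + base_->Name() + ")"; }
  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    // Pre/post hooks (guard bytes, NaN fill, ...) would go here.
    return base_->AllocateRaw(alignment, num_bytes);
  }
  void DeallocateRaw(void* ptr) override { base_->DeallocateRaw(ptr); }

 private:
  Allocator* base_;  // owned, as in GPUDebugAllocator
};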
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
index 236a0afa0b..98283cd846 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -35,7 +35,10 @@ namespace {
 
 TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
   const CudaGpuId cuda_gpu_id(0);
-  GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                       cuda_gpu_id);
   auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
@@ -59,7 +62,10 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
   EXPECT_DEATH(
       {
         const CudaGpuId cuda_gpu_id(0);
-        GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+        GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+            GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
+            cuda_gpu_id, false /*use_unified_memory*/, {}, {});
+        GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                             cuda_gpu_id);
         auto stream_exec =
             GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -92,7 +98,10 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
   EXPECT_DEATH(
       {
         const CudaGpuId cuda_gpu_id(0);
-        GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+        GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+            GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
+            cuda_gpu_id, false /*use_unified_memory*/, {}, {});
+        GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                             cuda_gpu_id);
         auto stream_exec =
             GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -122,7 +131,10 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
 
 TEST(GPUDebugAllocatorTest, ResetToNan) {
   const CudaGpuId cuda_gpu_id(0);
-  GPUNanResetAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUNanResetAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                          cuda_gpu_id);
   auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
@@ -163,8 +175,11 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
 TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
   const CudaGpuId cuda_gpu_id(0);
   // NaN reset must be the outer-most allocator.
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
   GPUNanResetAllocator a(
-      new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+      new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                             cuda_gpu_id),
       cuda_gpu_id);
   auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -205,15 +220,21 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
 
 TEST(GPUDebugAllocatorTest, TracksSizes) {
   const CudaGpuId cuda_gpu_id(0);
-  GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                       cuda_gpu_id);
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
 TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
   const CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
   GPUNanResetAllocator a(
-      new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+      new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                             cuda_gpu_id),
       cuda_gpu_id);
   float* t1 = a.Allocate<float>(1);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 2763ac0d4a..50e61b7e00 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -41,7 +41,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/common_runtime/local_device.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -285,6 +284,38 @@ BaseGPUDevice::~BaseGPUDevice() {
   for (auto ctx : device_contexts_) ctx->Unref();
 }
 
+// This should be idempotent if already initialized.
+Status BaseGPUDevice::InitScratchBuffers() {
+  mutex_lock l(scratch_init_mutex_);
+  if (scratch_.size() < max_streams_) {
+    for (int i = 0; i < max_streams_; i++) {
+      DCHECK(streams_[i]);
+      if (scratch_.size() > i && scratch_[i]) continue;
+      size_t scratch_buffer_size =
+          Eigen::kCudaScratchSize + sizeof(unsigned int);
+      void* scratch_buffer = gpu_allocator_->AllocateRaw(
+          Allocator::kAllocatorAlignment, scratch_buffer_size);
+      if (scratch_buffer == nullptr) {
+        return errors::FailedPrecondition(
+            "Failed to allocate scratch buffer for device ",
+            tf_gpu_id_.value());
+      }
+      se::DeviceMemory<char> mem(
+          se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
+
+      bool ok = executor_->SynchronousMemZero(
+          &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
+      if (!ok) {
+        return errors::FailedPrecondition(
+            "Failed to memcopy into scratch buffer for device ",
+            tf_gpu_id_.value());
+      }
+      scratch_.push_back(static_cast<char*>(scratch_buffer));
+    }
+  }
+  return Status::OK();
+}
+
 Status BaseGPUDevice::Init(const SessionOptions& options) {
   auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
   if (!executor_status.status().ok()) {
@@ -303,27 +334,6 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
   for (int i = 0; i < max_streams_; i++) {
     streams_.push_back(StreamGroupFactory::Global().GetOrCreate(
         tf_gpu_id_, i, executor_, options.config.gpu_options()));
-
-    size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
-    void* scratch_buffer = gpu_allocator_->AllocateRaw(
-        Allocator::kAllocatorAlignment, scratch_buffer_size);
-    if (scratch_buffer == nullptr) {
-      return errors::FailedPrecondition(
-          "Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
-    }
-    scratch_.push_back(static_cast<char*>(scratch_buffer));
-
-    se::DeviceMemory<char> mem(
-        se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
-
-    bool ok = executor_->SynchronousMemZero(
-        &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
-    if (!ok) {
-      return errors::FailedPrecondition(
-          "Failed to memcopy into scratch buffer for device ",
-          tf_gpu_id_.value());
-    }
-
     device_contexts_.push_back(new GPUDeviceContext(
         i, streams_.back()->compute, streams_.back()->host_to_device,
         streams_.back()->device_to_host, streams_.back()->device_to_device));
@@ -867,10 +877,11 @@ PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
   return new ConcretePerOpGpuDevice();
 }
 
-void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
-                                          PerOpGpuDevice* device,
-                                          DeviceContext* dc,
-                                          Allocator* allocator) {
+Status BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
+                                            PerOpGpuDevice* device,
+                                            DeviceContext* dc,
+                                            Allocator* allocator) {
+  TF_RETURN_IF_ERROR(InitScratchBuffers());
   if (dc) {
     const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
     const int stream_id = gpu_dc->stream_id();
@@ -881,6 +892,7 @@ void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
   } else {
     ReinitializeDevice(context, device, 0, allocator);
   }
+  return Status::OK();
 }
 
 Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr,
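InitScratchBuffers() above moves the Eigen scratch-buffer allocation out of Init() and makes it lazy: it takes scratch_init_mutex_, skips streams that already have a buffer, and surfaces failures as a Status. A stripped-down sketch of that idempotent lazy-init shape (plain C++, no CUDA; calloc stands in for gpu_allocator_, and like scratch_ in the diff the buffers are never freed):

#include <cstdlib>
#include <mutex>
#include <vector>

class Device {
 public:
  // Idempotent: safe to call before every op launch, as
  // ReinitializeGpuDevice() now does.
  bool InitScratchBuffers(int max_streams) {
    std::lock_guard<std::mutex> l(scratch_init_mutex_);
    while (static_cast<int>(scratch_.size()) < max_streams) {
      char* buf = static_cast<char*>(std::calloc(1, 4096));
      if (buf == nullptr) return false;  // surfaced as a Status in the diff
      scratch_.push_back(buf);
    }
    return true;
  }

 private:
  std::mutex scratch_init_mutex_;
  std::vector<char*> scratch_;  // one zeroed buffer per stream, never freed
};

Deferring the allocation means a process that creates GPU devices but never runs a kernel no longer pays for the scratch memory, at the cost of a mutex acquisition on the first launch.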
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 56d03d7a8c..b3eea55758 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -86,8 +86,9 @@ class BaseGPUDevice : public LocalDevice {
   // The caller owns the returned device.
   PerOpGpuDevice* MakeGpuDevice() override;
 
-  void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
-                             DeviceContext* dc, Allocator* allocator) override;
+  Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
+                               DeviceContext* dc,
+                               Allocator* allocator) override;
 
   // Returns the CUDA GPU id of this device within the native driver system;
   // e.g., for CUDA this is the ordinal of the GPU within the system.
@@ -125,6 +126,7 @@ class BaseGPUDevice : public LocalDevice {
   class StreamGroupFactory;
 
   gtl::InlinedVector<StreamGroup*, 4> streams_;
+  mutex scratch_init_mutex_;
   gtl::InlinedVector<char*, 4> scratch_;
   std::vector<GPUDeviceContext*> device_contexts_;
   GpuDeviceInfo* gpu_device_info_ = nullptr;
@@ -135,6 +137,9 @@ class BaseGPUDevice : public LocalDevice {
   std::unique_ptr<EventMgr> em_;
   std::unique_ptr<thread::ThreadPool> thread_pool_;
 
+  // Initialize scratch buffers used by Eigen.
+  Status InitScratchBuffers();
+
   void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
                           int stream_id, Allocator* allocator);
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index b18688174d..9ec740fabe 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -76,12 +76,16 @@ GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
 // This function is defined for debugging problems with the allocators.
 GPUProcessState::~GPUProcessState() {
   CHECK_EQ(this, instance_);
-  for (auto p : gpu_allocators_) {
-    delete p;
-  }
   instance_ = nullptr;
 }
 
+int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
+  // Return the NUMA node associated with the GPU's StreamExecutor.
+  se::StreamExecutor* se =
+      GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
+  return se->GetDeviceDescription().numa_node();
+}
+
 Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
                                             TfGpuId tf_gpu_id,
                                             size_t total_bytes) {
@@ -93,13 +97,10 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
 
   if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
     gpu_allocators_.resize(tf_gpu_id.value() + 1);
-    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
-      gpu_al_.resize(tf_gpu_id.value() + 1);
   }
 
-  if (gpu_allocators_[tf_gpu_id.value()] == nullptr) {
-    VisitableAllocator* gpu_allocator;
-
+  AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
+  if (allocator_parts.allocator.get() == nullptr) {
     // Validate allocator types.
     if (!allocator_type.empty() && allocator_type != "BFC") {
       LOG(ERROR) << "Invalid allocator type: " << allocator_type;
       return nullptr;
     }
     CudaGpuId cuda_gpu_id;
     TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
-    gpu_allocator =
-        new GPUBFCAllocator(cuda_gpu_id, total_bytes, options,
+    int bus_id = BusIdForGPU(tf_gpu_id);
+    while (bus_id >= gpu_visitors_.size()) {
+      gpu_visitors_.push_back({});
+    }
+    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+        (options.per_process_gpu_memory_fraction() > 1.0 ||
+         options.experimental().use_unified_memory()),
+        gpu_visitors_[bus_id], {});
+    Allocator* gpu_allocator =
+        new GPUBFCAllocator(sub_allocator, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
 
     // If true, checks for memory overwrites by writing
@@ -123,34 +133,25 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
       // **WARNING** probably will not work in a multi-gpu scenario
       gpu_allocator = new GPUcudaMallocAllocator(gpu_allocator, cuda_gpu_id);
     }
-    gpu_allocators_[tf_gpu_id.value()] = gpu_allocator;
-
-    // If there are any pending AllocVisitors for this bus, add
-    // them now.
-    se::StreamExecutor* se =
-        GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
-    int bus_id = se->GetDeviceDescription().numa_node();
-    if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
-      for (const auto& v : gpu_visitors_[bus_id]) {
-        gpu_allocator->AddAllocVisitor(v);
-      }
-    }
+
+    Allocator* recording_allocator = nullptr;
     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
       ProcessState::MemDesc md;
       md.loc = ProcessState::MemDesc::GPU;
       md.dev_index = cuda_gpu_id.value();
       md.gpu_registered = false;
       md.nic_registered = true;
-      if (static_cast<int64>(gpu_al_.size()) <= tf_gpu_id.value()) {
-        gpu_al_.resize(tf_gpu_id.value() + 1);
-      }
-      gpu_al_[tf_gpu_id.value()] = new internal::RecordingAllocator(
+      recording_allocator = new internal::RecordingAllocator(
           &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
     }
+    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator), sub_allocator,
+                       std::unique_ptr<Allocator>(recording_allocator)};
+  }
+  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
+    return allocator_parts.recording_allocator.get();
+  } else {
+    return allocator_parts.allocator.get();
   }
-  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
-    return gpu_al_[tf_gpu_id.value()];
-  return gpu_allocators_[tf_gpu_id.value()];
 #else
   LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda.";
   return nullptr;
@@ -172,11 +173,12 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
     tf_shared_lock lock(mu_);
 
     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
-        static_cast<int>(cuda_al_.size()) > 0) {
-      return cuda_al_[0];
+        !cuda_host_allocators_.empty() &&
+        cuda_host_allocators_[0].recording_allocator != nullptr) {
+      return cuda_host_allocators_[0].recording_allocator.get();
     }
     if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
-      return cuda_host_allocators_[0];
+      return cuda_host_allocators_[0].allocator.get();
     }
   }
 
@@ -190,7 +192,7 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
   // it knows is valid.
se::StreamExecutor* se = nullptr; for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) { - if (gpu_allocators_[i] != nullptr) { + if (gpu_allocators_[i].allocator != nullptr) { se = GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie(); break; } @@ -199,6 +201,15 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) { CHECK_NE(nullptr, se); while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) { + while (cuda_host_alloc_visitors_.size() <= numa_node) { + cuda_host_alloc_visitors_.push_back({}); + } + while (cuda_host_free_visitors_.size() <= numa_node) { + cuda_host_free_visitors_.push_back({}); + } + SubAllocator* sub_allocator = new CUDAHostAllocator( + se, numa_node, cuda_host_alloc_visitors_[numa_node], + cuda_host_free_visitors_[numa_node]); // TODO(zheng-xq): evaluate whether 64GB by default is the best choice. int64 cuda_host_mem_limit_in_mb = -1; Status status = ReadInt64FromEnvVar("TF_CUDA_HOST_MEM_LIMIT_IN_MB", @@ -208,62 +219,92 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) { LOG(ERROR) << "GetCUDAHostAllocator: " << status.error_message(); } int64 cuda_host_mem_limit = cuda_host_mem_limit_in_mb * (1LL << 20); - VisitableAllocator* allocator = - new BFCAllocator(new CUDAHostAllocator(se), cuda_host_mem_limit, + Allocator* allocator = + new BFCAllocator(sub_allocator, cuda_host_mem_limit, true /*allow_growth*/, "cuda_host_bfc" /*name*/); - if (LogMemory::IsEnabled()) { + if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) { // Wrap the allocator to track allocation ids for better logging // at the cost of performance. - allocator = new TrackingVisitableAllocator(allocator, true); + allocator = new TrackingAllocator(allocator, true); } - cuda_host_allocators_.push_back(allocator); + cuda_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator), + sub_allocator, + std::unique_ptr<Allocator>(nullptr)}); + AllocatorParts& allocator_parts = cuda_host_allocators_.back(); if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) { ProcessState::MemDesc md; md.loc = ProcessState::MemDesc::CPU; md.dev_index = 0; md.gpu_registered = true; md.nic_registered = false; - cuda_al_.push_back(new internal::RecordingAllocator( - &process_state_->mem_desc_map_, cuda_host_allocators_.back(), md, - &mu_)); + allocator_parts.recording_allocator.reset( + new internal::RecordingAllocator(&process_state_->mem_desc_map_, + allocator_parts.allocator.get(), md, + &mu_)); } } - if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) - return cuda_al_[0]; - return cuda_host_allocators_[0]; + if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) { + return cuda_host_allocators_[0].recording_allocator.get(); + } else { + return cuda_host_allocators_[0].allocator.get(); + } } void GPUProcessState::AddGPUAllocVisitor(int bus_id, - const AllocVisitor& visitor) { - CHECK(process_state_); + const SubAllocator::Visitor& visitor) { #if GOOGLE_CUDA mutex_lock lock(mu_); - for (int i = 0; i < static_cast<int64>(gpu_allocators_.size()); ++i) { - se::StreamExecutor* se = - GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie(); - if (gpu_allocators_[i] && - (se->GetDeviceDescription().numa_node() + 1) == bus_id) { - gpu_allocators_[i]->AddAllocVisitor(visitor); - } - } + CHECK(gpu_allocators_.empty()) // Crash OK + << "AddGPUAllocVisitor must be called before " + "first call to GetGPUAllocator."; while (bus_id >= static_cast<int64>(gpu_visitors_.size())) { - 
gpu_visitors_.push_back(std::vector<AllocVisitor>()); + gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>()); } gpu_visitors_[bus_id].push_back(visitor); #endif // GOOGLE_CUDA } +void GPUProcessState::AddCUDAHostAllocVisitor( + int numa_node, const SubAllocator::Visitor& visitor) { +#if GOOGLE_CUDA + mutex_lock lock(mu_); + CHECK(cuda_host_allocators_.empty()) // Crash OK + << "AddCUDAHostAllocVisitor must be called before " + "first call to GetCUDAHostAllocator."; + while (numa_node >= static_cast<int64>(cuda_host_alloc_visitors_.size())) { + cuda_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>()); + } + cuda_host_alloc_visitors_[numa_node].push_back(visitor); +#endif // GOOGLE_CUDA +} + +void GPUProcessState::AddCUDAHostFreeVisitor( + int numa_node, const SubAllocator::Visitor& visitor) { +#if GOOGLE_CUDA + mutex_lock lock(mu_); + CHECK(cuda_host_allocators_.empty()) // Crash OK + << "AddCUDAHostFreeVisitor must be called before " + "first call to GetCUDAHostAllocator."; + while (numa_node >= static_cast<int64>(cuda_host_free_visitors_.size())) { + cuda_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>()); + } + cuda_host_free_visitors_[numa_node].push_back(visitor); +#endif // GOOGLE_CUDA +} + void GPUProcessState::TestOnlyReset() { - process_state_->ProcessState::TestOnlyReset(); + if (process_state_) { + process_state_->ProcessState::TestOnlyReset(); + } { mutex_lock lock(mu_); gpu_device_enabled_ = false; + gpu_allocators_.clear(); gpu_visitors_.clear(); - gtl::STLDeleteElements(&gpu_allocators_); - gtl::STLDeleteElements(&cuda_host_allocators_); - gtl::STLDeleteElements(&gpu_al_); - gtl::STLDeleteElements(&cuda_al_); + cuda_host_allocators_.clear(); + cuda_host_alloc_visitors_.clear(); + cuda_host_free_visitors_.clear(); } } diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h index cb41c3c6bd..43e9a31660 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h +++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h @@ -32,7 +32,6 @@ limitations under the License. namespace tensorflow { class Allocator; -class VisitableAllocator; class PoolAllocator; // Singleton that manages per-process state when GPUs are present. @@ -72,18 +71,30 @@ class GPUProcessState { virtual Allocator* GetCUDAHostAllocator(int numa_node); - // Registers a function to be called once on every new Region - // allocated by every GPURegionAllocator proximate to the specified - // bus. The AllocVisitor is provided with a memory pointer and the - // size of the area it identifies. The pointer is not guaranteed to - // be valid after the call terminates. The intention is for this - // interface to be used for network device memory registration. - // "bus_id" is platform-specific. On many platforms it - // should be 0. On machines with multiple PCIe buses, it should be - // the index of one of the PCIe buses. If the bus_id is invalid, - // results are undefined. - typedef std::function<void(void*, size_t)> AllocVisitor; - virtual void AddGPUAllocVisitor(int bus_id, const AllocVisitor& visitor); + // Registers a Visitor to be invoked on new chunks of memory allocated by the + // SubAllocator of every GPU proximate to the specified bus. The AllocVisitor + // is provided with a memory pointer, a GPU id, and the size of the area it + // identifies. The pointer is not guaranteed to be valid after the call + // terminates. 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
index cb41c3c6bd..43e9a31660 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
@@ -32,7 +32,6 @@ limitations under the License.
 namespace tensorflow {
 class Allocator;
-class VisitableAllocator;
 class PoolAllocator;
 // Singleton that manages per-process state when GPUs are present.
@@ -72,18 +71,30 @@ class GPUProcessState {
   virtual Allocator* GetCUDAHostAllocator(int numa_node);
-  // Registers a function to be called once on every new Region
-  // allocated by every GPURegionAllocator proximate to the specified
-  // bus. The AllocVisitor is provided with a memory pointer and the
-  // size of the area it identifies. The pointer is not guaranteed to
-  // be valid after the call terminates. The intention is for this
-  // interface to be used for network device memory registration.
-  // "bus_id" is platform-specific. On many platforms it
-  // should be 0. On machines with multiple PCIe buses, it should be
-  // the index of one of the PCIe buses. If the bus_id is invalid,
-  // results are undefined.
-  typedef std::function<void(void*, size_t)> AllocVisitor;
-  virtual void AddGPUAllocVisitor(int bus_id, const AllocVisitor& visitor);
+  // Registers a Visitor to be invoked on new chunks of memory allocated by the
+  // SubAllocator of every GPU proximate to the specified bus. The AllocVisitor
+  // is provided with a memory pointer, a GPU id, and the size of the area it
+  // identifies. The pointer is not guaranteed to be valid after the call
+  // terminates. The intention is for this interface to be used for network
+  // device memory registration. "bus_id" is platform-specific. On many
+  // platforms it should be 0. On machines with multiple PCIe buses, it should
+  // be the index of one of the PCIe buses (maybe the NUMA node at which the
+  // PCIe is rooted). If the bus_id is invalid, results are undefined.
+  virtual void AddGPUAllocVisitor(int bus_id,
+                                  const SubAllocator::Visitor& visitor);
+
+  // Registers a Visitor to be invoked on new chunks of memory allocated by
+  // the SubAllocator of the CUDAHostAllocator for the given numa_node.
+  virtual void AddCUDAHostAllocVisitor(int numa_node,
+                                       const SubAllocator::Visitor& visitor);
+
+  // Registers a Visitor to be invoked on each chunk handed back for freeing to
+  // the SubAllocator of the CUDAHostAllocator for the given numa_node.
+  virtual void AddCUDAHostFreeVisitor(int numa_node,
+                                      const SubAllocator::Visitor& visitor);
+
+  // Returns bus_id for the given GPU id.
+  virtual int BusIdForGPU(TfGpuId tf_gpu_id);
  protected:
   GPUProcessState();
@@ -103,16 +114,21 @@ class GPUProcessState {
   mutex mu_;
-  std::vector<VisitableAllocator*> gpu_allocators_ GUARDED_BY(mu_);
-  std::vector<std::vector<AllocVisitor>> gpu_visitors_ GUARDED_BY(mu_);
-  std::vector<Allocator*> cuda_host_allocators_ GUARDED_BY(mu_);
+  struct AllocatorParts {
+    std::unique_ptr<Allocator> allocator;
+    SubAllocator* sub_allocator;  // owned by allocator
+    std::unique_ptr<Allocator> recording_allocator;
+  };
+  std::vector<AllocatorParts> gpu_allocators_ GUARDED_BY(mu_);
+  std::vector<std::vector<SubAllocator::Visitor>> gpu_visitors_ GUARDED_BY(mu_);
-  virtual ~GPUProcessState();
+  std::vector<AllocatorParts> cuda_host_allocators_ GUARDED_BY(mu_);
+  std::vector<std::vector<SubAllocator::Visitor>> cuda_host_alloc_visitors_
+      GUARDED_BY(mu_);
+  std::vector<std::vector<SubAllocator::Visitor>> cuda_host_free_visitors_
+      GUARDED_BY(mu_);
-  // Optional RecordingAllocators that wrap the corresponding
-  // Allocators for runtime attribute use analysis.
-  std::vector<Allocator*> gpu_al_ GUARDED_BY(mu_);
-  std::vector<Allocator*> cuda_al_ GUARDED_BY(mu_);
+  virtual ~GPUProcessState();
   friend class GPUDeviceTest;
 };
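The AllocatorParts bundle encodes an ownership convention worth spelling out: the wrapping Allocator owns the SubAllocator (per the "owned by allocator" comment), so only the Allocator itself is held by unique_ptr. A sketch of the same convention in isolation; the size and name are illustrative:

// Sketch, not part of the patch: the BFC allocator takes ownership of the
// SubAllocator it is built from, mirroring AllocatorParts above.
#include <memory>
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"

namespace tensorflow {
std::unique_ptr<Allocator> MakeExampleBfcAllocator() {
  SubAllocator* sub = new BasicCPUAllocator(0 /*numa_node*/, {}, {});
  // 'sub' must not be deleted separately; it lives and dies with the BFC.
  return std::unique_ptr<Allocator>(
      new BFCAllocator(sub, 1LL << 30 /*1GiB, illustrative*/,
                       true /*allow_growth*/, "example_bfc"));
}
}  // namespace tensorflow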
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
index 583bff2c07..6b2f6547b0 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
@@ -31,7 +31,8 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) {
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
           platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie()),
+              .ValueOrDie(),
+          0 /*numa_node*/, {}, {}),
       new NoopRounder, "pool");
   EXPECT_EQ(nullptr, pool.AllocateRaw(4 /*alignment*/, 0 /*num_bytes*/));
@@ -49,7 +50,8 @@ TEST(PoolAllocatorTest, ZeroSizePool) {
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
           platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie()),
+              .ValueOrDie(),
+          0 /*numa_node*/, {}, {}),
       new NoopRounder, "pool");
   EXPECT_EQ(0, pool.get_from_pool_count());
@@ -82,7 +84,8 @@ TEST(PoolAllocatorTest, Alignment) {
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
           platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie()),
+              .ValueOrDie(),
+          0 /*numa_node*/, {}, {}),
       new NoopRounder, "pool");
   for (int i = 0; i < 16; ++i) {
     size_t alignment = 1 << i;
@@ -97,8 +100,8 @@ TEST(PoolAllocatorTest, Alignment) {
 TEST(PoolAllocatorTest, AutoResize) {
   PoolAllocator pool(2 /*pool_size_limit*/, true /*auto_resize*/,
-                     new BasicCPUAllocator(0 /*numa_node*/), new NoopRounder,
-                     "pool");
+                     new BasicCPUAllocator(0 /*numa_node*/, {}, {}),
+                     new NoopRounder, "pool");
   // Alloc/dealloc 10 sizes just a few times, confirming pool size
   // stays at 2.
@@ -123,14 +126,32 @@ TEST(PoolAllocatorTest, AutoResize) {
 }
 TEST(PoolAllocatorTest, CudaHostAllocator) {
+  int alloc_count = 0;
+  int64 alloc_size = 0;
+  SubAllocator::Visitor alloc_visitor =
+      [&alloc_count, &alloc_size](void* ptr, int numa_node, int64 size) {
+        ++alloc_count;
+        alloc_size += size;
+      };
+  int free_count = 0;
+  int64 free_size = 0;
+  SubAllocator::Visitor free_visitor =
+      [&free_count, &free_size](void* ptr, int numa_node, int64 size) {
+        ++free_count;
+        free_size += size;
+      };
   se::Platform* platform =
       se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
-  PoolAllocator pool(
-      2 /*pool_size_limit*/, false /*auto_resize*/,
-      new CUDAHostAllocator(
-          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie()),
-      new NoopRounder, "pool");
+  CUDAHostAllocator* sub_allocator = new CUDAHostAllocator(
+      platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
+          .ValueOrDie(),
+      0 /*numa_node*/, {alloc_visitor}, {free_visitor});
+  PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/,
+                     sub_allocator, new NoopRounder, "pool");
+  EXPECT_EQ(0, alloc_count);
+  EXPECT_EQ(0, alloc_size);
+  EXPECT_EQ(0, free_count);
+  EXPECT_EQ(0, free_size);
   // Repeatedly Get a 16-byte value, confirming that there's only
   // one real allocation.
@@ -138,6 +159,10 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   EXPECT_EQ(0, pool.get_from_pool_count());
   EXPECT_EQ(1, pool.allocated_count());
   EXPECT_NE(nullptr, p1_16);
+  EXPECT_EQ(1, alloc_count);  // Underlying suballoc of 16 bytes
+  // Each suballocation includes a 16B ChunkPrefix.
+  static const int kChunkPrefixSize = 16;
+  EXPECT_EQ(16 + (alloc_count * kChunkPrefixSize), alloc_size);
   pool.DeallocateRaw(p1_16);
   // Pool contents {16}
   EXPECT_EQ(1, pool.put_count());
@@ -148,6 +173,9 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   pool.DeallocateRaw(p2_16);  // Put it back.
   // Pool contents {16}
   EXPECT_EQ(2, pool.put_count());
+  EXPECT_EQ(1, alloc_count);  // Underlying suballoc of 16 bytes
+  EXPECT_EQ(16 + (alloc_count * kChunkPrefixSize), alloc_size);
+  EXPECT_EQ(0, free_count);
   // Get two more values of different sizes.
   void* p3_4 = pool.AllocateRaw(4, 4);
@@ -160,6 +188,9 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   void* p4_2 = pool.AllocateRaw(4, 2);  // Get a third size buffer.
   EXPECT_NE(nullptr, p4_2);
   EXPECT_EQ(0, pool.evicted_count());
+  EXPECT_EQ(3, alloc_count);
+  EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
+  EXPECT_EQ(0, free_count);
   // The pool is full: when we put back p4_2, the 16-byte buffer
   // should be evicted since it was least recently inserted.
@@ -167,6 +198,10 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   // Pool contents {2, 4}
   EXPECT_EQ(4, pool.put_count());
   EXPECT_EQ(1, pool.evicted_count());
+  EXPECT_EQ(3, alloc_count);
+  EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
+  EXPECT_EQ(1, free_count);
+  EXPECT_EQ(16 + (free_count * kChunkPrefixSize), free_size);
   // Re-getting and putting size 2 or 4 should not alter pool size or
   // num-evicted.
@@ -180,12 +215,20 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   EXPECT_EQ(6, pool.put_count());
   EXPECT_EQ(3, pool.allocated_count());
   EXPECT_EQ(1, pool.evicted_count());
+  EXPECT_EQ(3, alloc_count);
+  EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
+  EXPECT_EQ(1, free_count);
+  EXPECT_EQ(16 + (free_count * kChunkPrefixSize), free_size);
   pool.Clear();
   EXPECT_EQ(0, pool.get_from_pool_count());
   EXPECT_EQ(0, pool.put_count());
   EXPECT_EQ(0, pool.allocated_count());
   EXPECT_EQ(0, pool.evicted_count());
+  EXPECT_EQ(3, alloc_count);
+  EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
+  EXPECT_EQ(3, free_count);
+  EXPECT_EQ(16 + 4 + 2 + (free_count * kChunkPrefixSize), free_size);
 }
 TEST(PoolAllocatorTest, Pow2Rounder) {
@@ -206,7 +249,8 @@ TEST(PoolAllocatorTest, Name) {
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie()),
+              .ValueOrDie(),
+          0 /*numa_node*/, {}, {}),
       new NoopRounder, "pool");
   EXPECT_EQ("pool", pool.Name());
 }
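The size expectations in TEST(PoolAllocatorTest, CudaHostAllocator) all follow one rule: the visitor observes the requested bytes plus one 16-byte ChunkPrefix per underlying suballocation. A compile-time restatement of that arithmetic; this snippet is illustrative and not part of the test:

// Each PoolAllocator chunk carries a 16-byte ChunkPrefix, so a visitor sees
// requested_bytes + kChunkPrefixSize per underlying allocation.
#include <cstddef>

constexpr size_t kChunkPrefixSize = 16;
constexpr size_t VisitedSize(size_t requested) {
  return requested + kChunkPrefixSize;
}
// Three suballocations of 16, 4 and 2 bytes match the test's expectation of
// 16 + 4 + 2 + (3 * kChunkPrefixSize) = 70 visited bytes in total.
static_assert(VisitedSize(16) + VisitedSize(4) + VisitedSize(2) == 70,
              "visitor accounting matches the test above");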
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index df9c3a686c..538a70668a 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -23,12 +23,11 @@ limitations under the License.
 #include <cstdlib>
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
-#include "tensorflow/core/framework/allocator_registry.h"
+#include "tensorflow/core/common_runtime/pool_allocator.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mem.h"
-#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 #ifndef INTEL_MKL_DNN_ONLY
 #include "i_malloc.h"
@@ -40,20 +39,16 @@ typedef unsigned int uint;
 namespace tensorflow {
-class MklSubAllocator : public SubAllocator {
+class MklSubAllocator : public BasicCPUAllocator {
  public:
+  MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {}
   ~MklSubAllocator() override {}
-
-  void* Alloc(size_t alignment, size_t num_bytes) override {
-    return port::AlignedMalloc(num_bytes, alignment);
-  }
-  void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
 };
 // CPU allocator that handles small-size allocations by calling
 // suballocator directly. Mostly, it is just a wrapper around a suballocator
 // (that calls malloc and free directly) with support for bookkeeping.
-class MklSmallSizeAllocator : public VisitableAllocator {
+class MklSmallSizeAllocator : public Allocator {
  public:
   MklSmallSizeAllocator(SubAllocator* sub_allocator, size_t total_memory,
                         const string& name)
@@ -75,10 +70,6 @@ class MklSmallSizeAllocator : public VisitableAllocator {
       CHECK(map_.insert(map_val).second);
       // Increment statistics for small-size allocations.
       IncrementStats(num_bytes);
-      // Call alloc visitors.
-      for (const auto& visitor : alloc_visitors_) {
-        visitor(ptr, num_bytes);
-      }
     }
     return ptr;
   }
@@ -94,9 +85,6 @@ class MklSmallSizeAllocator : public VisitableAllocator {
     if (map_iter != map_.end()) {
       // Call free visitors.
       size_t dealloc_bytes = map_iter->second;
-      for (const auto& visitor : free_visitors_) {
-        visitor(ptr, dealloc_bytes);
-      }
       sub_allocator_->Free(ptr, dealloc_bytes);
       DecrementStats(dealloc_bytes);
       map_.erase(map_iter);
@@ -121,16 +109,6 @@ class MklSmallSizeAllocator : public VisitableAllocator {
     stats_.Clear();
   }
-  void AddAllocVisitor(Visitor visitor) override {
-    mutex_lock l(mutex_);
-    alloc_visitors_.push_back(visitor);
-  }
-
-  void AddFreeVisitor(Visitor visitor) override {
-    mutex_lock l(mutex_);
-    free_visitors_.push_back(visitor);
-  }
-
  private:
   // Increment statistics for the allocator handling small allocations.
   inline void IncrementStats(size_t alloc_size)
@@ -163,15 +141,11 @@ class MklSmallSizeAllocator : public VisitableAllocator {
   // Allocator stats for small allocs
   AllocatorStats stats_ GUARDED_BY(mutex_);
-
-  // Visitors
-  std::vector<Visitor> alloc_visitors_ GUARDED_BY(mutex_);
-  std::vector<Visitor> free_visitors_ GUARDED_BY(mutex_);
 };
 /// CPU allocator for MKL that wraps BFC allocator and intercepts
 /// and redirects memory allocation calls from MKL.
-class MklCPUAllocator : public VisitableAllocator {
+class MklCPUAllocator : public Allocator {
  public:
   // Constructor and other standard functions
@@ -284,16 +258,6 @@ class MklCPUAllocator : public VisitableAllocator {
     large_size_allocator_->ClearStats();
   }
-  void AddAllocVisitor(Visitor visitor) override {
-    small_size_allocator_->AddAllocVisitor(visitor);
-    large_size_allocator_->AddAllocVisitor(visitor);
-  }
-
-  void AddFreeVisitor(Visitor visitor) override {
-    small_size_allocator_->AddFreeVisitor(visitor);
-    large_size_allocator_->AddFreeVisitor(visitor);
-  }
-
  private:
   // Hooks provided by this allocator for memory allocation routines from MKL
@@ -330,7 +294,7 @@ class MklCPUAllocator : public VisitableAllocator {
   // The alignment that we need for the allocations
   static constexpr const size_t kAlignment = 64;
-  VisitableAllocator* large_size_allocator_;  // owned by this class
+  Allocator* large_size_allocator_;  // owned by this class
   MklSmallSizeAllocator* small_size_allocator_;  // owned by this class.
   SubAllocator* sub_allocator_;  // not owned by this class
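With visitor support now living in SubAllocator, MklSubAllocator no longer needs hand-rolled Alloc/Free or visitor lists; deriving from BasicCPUAllocator is enough. A hypothetical sketch of how an MKL-style suballocator could still accept visitors if one were ever needed; this class does not appear in the patch:

// Hypothetical sketch: an MKL-style suballocator that forwards visitor lists
// to the BasicCPUAllocator base instead of implementing AddAllocVisitor().
#include <vector>
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/platform/numa.h"

namespace tensorflow {
class VisitedMklSubAllocator : public BasicCPUAllocator {
 public:
  VisitedMklSubAllocator(const std::vector<Visitor>& alloc_visitors,
                         const std::vector<Visitor>& free_visitors)
      : BasicCPUAllocator(port::kNUMANoAffinity, alloc_visitors,
                          free_visitors) {}
  // Alloc()/Free() are inherited and already invoke the visitors.
};
}  // namespace tensorflow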
diff --git a/tensorflow/core/common_runtime/pool_allocator.cc b/tensorflow/core/common_runtime/pool_allocator.cc
index fdad8de8d6..66dc8f3322 100644
--- a/tensorflow/core/common_runtime/pool_allocator.cc
+++ b/tensorflow/core/common_runtime/pool_allocator.cc
@@ -40,8 +40,7 @@ PoolAllocator::PoolAllocator(size_t pool_size_limit, bool auto_resize,
       auto_resize_(auto_resize),
       pool_size_limit_(pool_size_limit),
       allocator_(allocator),
-      size_rounder_(size_rounder),
-      allocation_begun_(false) {
+      size_rounder_(size_rounder) {
   if (auto_resize) {
     CHECK_LT(size_t{0}, pool_size_limit)
         << "size limit must be > 0 if auto_resize is true.";
@@ -93,7 +92,6 @@ ChunkPrefix* FindPrefix(void* user_ptr) {
 }  // namespace
 void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
-  if (!allocation_begun_) allocation_begun_ = true;
   if (num_bytes == 0) return nullptr;
   // If alignment is larger than kPoolAlignment, increase num_bytes so that we
@@ -129,9 +127,6 @@ void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
     return PrepareChunk(r, alignment, num_bytes);
   } else {
     void* ptr = allocator_->Alloc(kPoolAlignment, num_bytes);
-    for (const auto& v : alloc_visitors_) {
-      v(ptr, num_bytes);
-    }
     return PrepareChunk(ptr, alignment, num_bytes);
   }
 }
@@ -141,9 +136,6 @@ void PoolAllocator::DeallocateRaw(void* ptr) {
   ChunkPrefix* cp = FindPrefix(ptr);
   CHECK_LE((void*)cp, (void*)ptr);
   if (!has_size_limit_ && !auto_resize_) {
-    for (const auto& v : free_visitors_) {
-      v(cp, cp->num_bytes);
-    }
     allocator_->Free(cp, cp->num_bytes);
   } else {
     mutex_lock lock(mutex_);
@@ -164,9 +156,6 @@ void PoolAllocator::Clear() {
     mutex_lock lock(mutex_);
     for (auto iter : pool_) {
       PtrRecord* pr = iter.second;
-      for (const auto& v : free_visitors_) {
-        v(pr->ptr, pr->num_bytes);
-      }
       allocator_->Free(pr->ptr, pr->num_bytes);
       delete pr;
     }
@@ -221,9 +210,6 @@ void PoolAllocator::EvictOne() {
       DCHECK(iter != pool_.end());
     }
     pool_.erase(iter);
-    for (const auto& v : free_visitors_) {
-      v(prec->ptr, prec->num_bytes);
-    }
     allocator_->Free(prec->ptr, prec->num_bytes);
     delete prec;
     ++evicted_count_;
@@ -269,28 +255,19 @@ void PoolAllocator::EvictOne() {
   }
 }
-void PoolAllocator::AddAllocVisitor(Visitor visitor) {
-  mutex_lock lock(mutex_);
-  CHECK(!allocation_begun_)
-      << "AddAllocVisitor may not be called after pool allocation "
-      << "has begun.";
-  alloc_visitors_.push_back(visitor);
-}
-
-void PoolAllocator::AddFreeVisitor(Visitor visitor) {
-  mutex_lock lock(mutex_);
-  CHECK(!allocation_begun_)
-      << "AddFreeVisitor may not be called after pool allocation "
-      << "has begun.";
-  free_visitors_.push_back(visitor);
-}
-
 void* BasicCPUAllocator::Alloc(size_t alignment, size_t num_bytes) {
-  return port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+  void* ptr = nullptr;
+  if (num_bytes > 0) {
+    ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+    VisitAlloc(ptr, numa_node_, num_bytes);
+  }
+  return ptr;
 }
 void BasicCPUAllocator::Free(void* ptr, size_t num_bytes) {
-  port::AlignedFree(ptr);
+  if (num_bytes > 0) {
+    VisitFree(ptr, numa_node_, num_bytes);
+    port::AlignedFree(ptr);
+  }
 }
-
 }  // namespace tensorflow
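Since PoolAllocator no longer carries visitors itself, instrumentation now attaches to the SubAllocator at construction time. A minimal sketch of the new wiring; the logging visitor and pool parameters are illustrative:

// Sketch: visitors ride on the SubAllocator, not on the pool.
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
PoolAllocator* MakeInstrumentedCpuPool() {
  SubAllocator::Visitor log_visitor = [](void* ptr, int numa_node,
                                         size_t num_bytes) {
    LOG(INFO) << "suballoc of " << num_bytes << " bytes on node " << numa_node;
  };
  return new PoolAllocator(
      100 /*pool_size_limit*/, true /*auto_resize*/,
      new BasicCPUAllocator(0 /*numa_node*/, {log_visitor}, {} /*free*/),
      new NoopRounder, "instrumented_cpu_pool");
}
}  // namespace tensorflow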
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index 607734445b..5b4623ba10 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -16,14 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_
-// Simple LRU pool allocators for various flavors of CPU RAM that
-// implement the VisitableAllocator interface.
+// Simple LRU pool allocators for various flavors of CPU RAM.
 #include <atomic>
 #include <map>
 #include <memory>
 #include <vector>
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -41,7 +40,7 @@ class RoundUpInterface {
 // Size-limited pool of memory buffers obtained from a SubAllocator
 // instance. Pool eviction policy is LRU.
-class PoolAllocator : public VisitableAllocator {
+class PoolAllocator : public Allocator {
  public:
   // "pool_size_limit" is the maximum number of returned, re-usable
   // memory buffers to keep in the pool. If pool_size_limit == 0, the
@@ -64,14 +63,6 @@ class PoolAllocator : public VisitableAllocator {
   void DeallocateRaw(void* ptr) override;
-  // REQUIRES: The following functions may only be called prior
-  // to the first Allocate*() call. Once allocation has begun, it is
-  // illegal to register another visitor.
-
-  void AddAllocVisitor(Visitor visitor) override;
-
-  void AddFreeVisitor(Visitor visitor) override;
-
   // Allocate an unused memory region of size "num_bytes". Fetch from
   // the pool if available, otherwise call allocator_.
   void* Get(size_t num_bytes);
@@ -141,12 +132,6 @@ class PoolAllocator : public VisitableAllocator {
   int64 put_count_ GUARDED_BY(mutex_) = 0;
   int64 allocated_count_ GUARDED_BY(mutex_) = 0;
   int64 evicted_count_ GUARDED_BY(mutex_) = 0;
-  // Write access to these is guarded by mutex_, but not read
-  // access. They may only be modified prior to the first
-  // allocation. Later attempts to modify will fail.
-  std::vector<Visitor> alloc_visitors_;
-  std::vector<Visitor> free_visitors_;
-  std::atomic<bool> allocation_begun_;
 };
 // Do-nothing rounder. Passes through sizes unchanged.
@@ -166,7 +151,9 @@ class Pow2Rounder : public RoundUpInterface {
 class BasicCPUAllocator : public SubAllocator {
  public:
   // Argument numa_node is currently ignored.
-  explicit BasicCPUAllocator(int numa_node) : numa_node_(numa_node) {}
+  BasicCPUAllocator(int numa_node, const std::vector<Visitor>& alloc_visitors,
+                    const std::vector<Visitor>& free_visitors)
+      : SubAllocator(alloc_visitors, free_visitors), numa_node_(numa_node) {}
   ~BasicCPUAllocator() override {}
@@ -176,6 +163,8 @@ class BasicCPUAllocator : public SubAllocator {
  private:
   int numa_node_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(BasicCPUAllocator);
 };
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/process_state.cc b/tensorflow/core/common_runtime/process_state.cc
index 447338e7bd..bcaa37fc8a 100644
--- a/tensorflow/core/common_runtime/process_state.cc
+++ b/tensorflow/core/common_runtime/process_state.cc
@@ -71,20 +71,28 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
   return MemDesc();
 }
-VisitableAllocator* ProcessState::GetCPUAllocator(int numa_node) {
+Allocator* ProcessState::GetCPUAllocator(int numa_node) {
   CHECK_GE(numa_node, 0);
   if (!numa_enabled_) numa_node = 0;
   mutex_lock lock(mu_);
   while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
+    // If visitors have been defined we need an Allocator built from
+    // a SubAllocator. Prefer BFCAllocator, but fall back to PoolAllocator
+    // depending on env var setting.
+    const bool alloc_visitors_defined =
+        (!cpu_alloc_visitors_.empty() || !cpu_free_visitors_.empty());
     bool use_bfc_allocator = false;
-    // TODO(reedwm): Switch default to BGFAllocator if it's at least as fast and
-    // efficient.
-    Status status = ReadBoolFromEnvVar("TF_CPU_ALLOCATOR_USE_BFC", false,
-                                       &use_bfc_allocator);
+    Status status = ReadBoolFromEnvVar(
+        "TF_CPU_ALLOCATOR_USE_BFC", alloc_visitors_defined, &use_bfc_allocator);
     if (!status.ok()) {
       LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
     }
-    VisitableAllocator* allocator;
+    Allocator* allocator = nullptr;
+    SubAllocator* sub_allocator =
+        (alloc_visitors_defined || use_bfc_allocator)
+            ? new BasicCPUAllocator(numa_enabled_ ? numa_node : -1,
+                                    cpu_alloc_visitors_, cpu_free_visitors_)
+            : nullptr;
     if (use_bfc_allocator) {
       // TODO(reedwm): evaluate whether 64GB by default is the best choice.
       int64 cpu_mem_limit_in_mb = -1;
@@ -95,34 +103,63 @@ VisitableAllocator* ProcessState::GetCPUAllocator(int numa_node) {
         LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
       }
       int64 cpu_mem_limit = cpu_mem_limit_in_mb * (1LL << 20);
-      allocator = new BFCAllocator(
-          new BasicCPUAllocator(numa_enabled_ ? numa_node : -1), cpu_mem_limit,
-          true /*allow_growth*/, "bfc_cpu_allocator_for_gpu" /*name*/);
+      DCHECK(sub_allocator);
+      allocator =
+          new BFCAllocator(sub_allocator, cpu_mem_limit, true /*allow_growth*/,
+                           "bfc_cpu_allocator_for_gpu" /*name*/);
       VLOG(2) << "Using BFCAllocator with memory limit of "
               << cpu_mem_limit_in_mb << " MB for ProcessState CPU allocator";
-    } else {
-      allocator = new PoolAllocator(
-          100 /*pool_size_limit*/, true /*auto_resize*/,
-          new BasicCPUAllocator(numa_enabled_ ? numa_node : -1),
-          new NoopRounder, "cpu_pool");
+    } else if (alloc_visitors_defined) {
+      DCHECK(sub_allocator);
+      allocator =
+          new PoolAllocator(100 /*pool_size_limit*/, true /*auto_resize*/,
+                            sub_allocator, new NoopRounder, "cpu_pool");
       VLOG(2) << "Using PoolAllocator for ProcessState CPU allocator "
               << "numa_enabled_=" << numa_enabled_
              << " numa_node=" << numa_node;
+    } else {
+      DCHECK(!sub_allocator);
+      allocator = cpu_allocator();
     }
-    if (LogMemory::IsEnabled()) {
+    if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
       // Wrap the allocator to track allocation ids for better logging
       // at the cost of performance.
-      allocator = new TrackingVisitableAllocator(allocator, true);
+      allocator = new TrackingAllocator(allocator, true);
     }
     cpu_allocators_.push_back(allocator);
+    if (!sub_allocator) {
+      DCHECK(cpu_alloc_visitors_.empty() && cpu_free_visitors_.empty());
+    }
   }
   return cpu_allocators_[numa_node];
 }
+void ProcessState::AddCPUAllocVisitor(SubAllocator::Visitor visitor) {
+  VLOG(1) << "AddCPUAllocVisitor";
+  mutex_lock lock(mu_);
+  CHECK_EQ(0, cpu_allocators_.size())  // Crash OK
+      << "AddCPUAllocVisitor must be called prior to first call to "
+         "ProcessState::GetCPUAllocator";
+  cpu_alloc_visitors_.push_back(std::move(visitor));
+}
+
+void ProcessState::AddCPUFreeVisitor(SubAllocator::Visitor visitor) {
+  mutex_lock lock(mu_);
+  CHECK_EQ(0, cpu_allocators_.size())  // Crash OK
+      << "AddCPUFreeVisitor must be called prior to first call to "
+         "ProcessState::GetCPUAllocator";
+  cpu_free_visitors_.push_back(std::move(visitor));
+}
+
 void ProcessState::TestOnlyReset() {
   mutex_lock lock(mu_);
+  // Don't delete this value because it's static.
+  Allocator* default_cpu_allocator = cpu_allocator();
   mem_desc_map_.clear();
-  gtl::STLDeleteElements(&cpu_allocators_);
+  for (Allocator* a : cpu_allocators_) {
+    if (a != default_cpu_allocator) delete a;
+  }
+  cpu_allocators_.clear();
   gtl::STLDeleteElements(&cpu_al_);
 }
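For the CPU path the ordering requirement reads the same way: all visitors must be in place before the first GetCPUAllocator call materializes an allocator. A caller-side sketch, with illustrative visitor bodies:

// Sketch: registering CPU visitors ahead of the first GetCPUAllocator call.
#include "tensorflow/core/common_runtime/process_state.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
void InstallCpuMemoryVisitors() {
  ProcessState* ps = ProcessState::singleton();
  ps->AddCPUAllocVisitor([](void* ptr, int numa_node, size_t num_bytes) {
    VLOG(1) << "new CPU region: " << num_bytes << " bytes on " << numa_node;
  });
  ps->AddCPUFreeVisitor([](void* ptr, int numa_node, size_t num_bytes) {
    VLOG(1) << "freeing CPU region: " << num_bytes << " bytes";
  });
  // Any GetCPUAllocator() call after this point builds a BFCAllocator over a
  // BasicCPUAllocator that carries these visitors.
}
}  // namespace tensorflow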
diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h
index 2892677333..cac312d849 100644
--- a/tensorflow/core/common_runtime/process_state.h
+++ b/tensorflow/core/common_runtime/process_state.h
@@ -30,7 +30,6 @@ limitations under the License.
 namespace tensorflow {
 class Allocator;
-class VisitableAllocator;
 class PoolAllocator;
 // Singleton that manages per-process state, e.g. allocation of
@@ -65,7 +64,15 @@ class ProcessState {
   // Returns the one CPUAllocator used for the given numa_node.
   // TEMPORARY: ignores numa_node.
-  VisitableAllocator* GetCPUAllocator(int numa_node);
+  Allocator* GetCPUAllocator(int numa_node);
+
+  // Registers alloc visitor for the CPU allocator(s).
+  // REQUIRES: must be called before GetCPUAllocator.
+  void AddCPUAllocVisitor(SubAllocator::Visitor v);
+
+  // Registers free visitor for the CPU allocator(s).
+  // REQUIRES: must be called before GetCPUAllocator.
+  void AddCPUFreeVisitor(SubAllocator::Visitor v);
   typedef std::unordered_map<const void*, MemDesc> MDMap;
@@ -87,7 +94,9 @@ class ProcessState {
   mutex mu_;
-  std::vector<VisitableAllocator*> cpu_allocators_ GUARDED_BY(mu_);
+  std::vector<Allocator*> cpu_allocators_ GUARDED_BY(mu_);
+  std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ GUARDED_BY(mu_);
+  std::vector<SubAllocator::Visitor> cpu_free_visitors_ GUARDED_BY(mu_);
   virtual ~ProcessState();
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
index 103eee03b3..9d59264899 100644
--- a/tensorflow/core/common_runtime/renamed_device.h
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -72,9 +72,10 @@ class RenamedDevice : public Device {
     return underlying_->MakeGpuDevice();
   }
-  void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
-                             DeviceContext* dc, Allocator* allocator) override {
-    underlying_->ReinitializeGpuDevice(context, device, dc, allocator);
+  Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
+                               DeviceContext* dc,
+                               Allocator* allocator) override {
+    return underlying_->ReinitializeGpuDevice(context, device, dc, allocator);
   }
   Status MakeTensorFromProto(const TensorProto& tensor_proto,
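Because ReinitializeGpuDevice now returns Status, every override has to propagate a value the way RenamedDevice does above. A sketch of what a wrapping Device subclass would look like after this change; MyDevice, its wrapped_ member, and the PrepareScratch helper are all hypothetical:

// Hypothetical override, sketched to show the new Status-returning contract.
Status MyDevice::ReinitializeGpuDevice(OpKernelContext* context,
                                       PerOpGpuDevice* device,
                                       DeviceContext* dc,
                                       Allocator* allocator) {
  TF_RETURN_IF_ERROR(PrepareScratch(allocator));  // hypothetical helper
  return wrapped_->ReinitializeGpuDevice(context, device, dc, allocator);
}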
diff --git a/tensorflow/core/common_runtime/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h
deleted file mode 100644
index ae0563a96a..0000000000
--- a/tensorflow/core/common_runtime/visitable_allocator.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
-#define TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
-
-#include <functional>
-#include "tensorflow/core/framework/allocator.h"
-#include "tensorflow/core/framework/tracking_allocator.h"
-
-namespace tensorflow {
-
-// Subclass VisitableAllocator instead of Allocator when a memory
-// allocator needs to enable some kind of registration/deregistration
-// of memory areas.
-class VisitableAllocator : public Allocator {
- public:
-  // Visitor gets called with a pointer to a memory area and its
-  // size in bytes.
-  typedef std::function<void(void*, size_t)> Visitor;
-
-  // Register a visitor guaranteed to be called exactly once on each
-  // chunk of memory newly allocated from the underlying device.
-  // Typically, chunks will be reused and possibly sub-divided by a
-  // pool manager, so the calls will happen only once per process
-  // execution, not once per tensor (re)allocation.
-  virtual void AddAllocVisitor(Visitor visitor) = 0;
-
-  // Register a visitor guaranteed to be called on each chunk of
-  // memory returned to the underlying device.
-  virtual void AddFreeVisitor(Visitor visitor) = 0;
-};
-
-// Needed for cases when a VisitableAllocator gets wrapped for tracking.
-// Multiple-inheritance is considered acceptable in this case because
-// VisitableAllocator is a pure virtual interface and only TrackingAllocator
-// has default implementation.
-class TrackingVisitableAllocator : public TrackingAllocator,
-                                   public VisitableAllocator {
- public:
-  TrackingVisitableAllocator(VisitableAllocator* allocator, bool track_ids)
-      : TrackingAllocator(allocator, track_ids), allocator_(allocator) {}
-  ~TrackingVisitableAllocator() override {}
-
-  string Name() override { return TrackingAllocator::Name(); }
-
-  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    return TrackingAllocator::AllocateRaw(alignment, num_bytes);
-  }
-
-  void DeallocateRaw(void* ptr) override {
-    TrackingAllocator::DeallocateRaw(ptr);
-  }
-
-  void AddAllocVisitor(Visitor visitor) override {
-    allocator_->AddAllocVisitor(visitor);
-  }
-
-  void AddFreeVisitor(Visitor visitor) override {
-    allocator_->AddFreeVisitor(visitor);
-  }
-
- protected:
-  VisitableAllocator* allocator_;
-};
-}  // namespace tensorflow
-#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 2a7ee16a16..84cee5569c 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -196,7 +196,7 @@ class CPUAllocatorFactory : public AllocatorFactory {
 class CPUSubAllocator : public SubAllocator {
  public:
   explicit CPUSubAllocator(CPUAllocator* cpu_allocator)
-      : cpu_allocator_(cpu_allocator) {}
+      : SubAllocator({}, {}), cpu_allocator_(cpu_allocator) {}
   void* Alloc(size_t alignment, size_t num_bytes) override {
     return cpu_allocator_->AllocateRaw(alignment, num_bytes);
@@ -222,4 +222,22 @@ Allocator* cpu_allocator() {
   }
   return cpu_alloc;
 }
+
+SubAllocator::SubAllocator(const std::vector<Visitor>& alloc_visitors,
+                           const std::vector<Visitor>& free_visitors)
+    : alloc_visitors_(alloc_visitors), free_visitors_(free_visitors) {}
+
+void SubAllocator::VisitAlloc(void* ptr, int index, size_t num_bytes) {
+  for (const auto& v : alloc_visitors_) {
+    v(ptr, index, num_bytes);
+  }
+}
+
+void SubAllocator::VisitFree(void* ptr, int index, size_t num_bytes) {
+  // Although we don't guarantee any order of visitor application, strive
+  // to apply free visitors in reverse order of alloc visitors.
+  for (int i = free_visitors_.size() - 1; i >= 0; --i) {
+    free_visitors_[i](ptr, index, num_bytes);
+  }
+}
 }  // namespace tensorflow
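One subtlety in VisitFree above: free visitors run in reverse registration order, giving constructor/destructor-style nesting when one visitor's cleanup depends on another's. The signature change relative to the deleted VisitableAllocator is also worth restating as a migration sketch, not part of the patch:

// Old: VisitableAllocator::Visitor was std::function<void(void*, size_t)>.
// New: SubAllocator::Visitor adds the device index.
#include "tensorflow/core/framework/allocator.h"

namespace tensorflow {
SubAllocator::Visitor example_visitor = [](void* ptr, int index,
                                           size_t num_bytes) {
  // 'index' is the numa_node for CPU suballocators and the GPU id for GPU
  // suballocators; the old two-argument form had no equivalent field.
};
}  // namespace tensorflow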
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index ded120b704..8c23604625 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 namespace tensorflow {
@@ -387,13 +388,36 @@ void EnableCPUAllocatorStats(bool enable);
 // full statistics. By default, it's disabled.
 void EnableCPUAllocatorFullStats(bool enable);
-// Abstract interface of an object that does the underlying suballoc/free of
-// memory for a higher-level allocator.
+// An object that does the underlying suballoc/free of memory for a
+// higher-level allocator. The expectation is that the higher-level allocator
+// is doing some kind of cache or pool management so that it will call
+// SubAllocator::Alloc and Free relatively infrequently, compared to the number
+// of times its own AllocateRaw and Free methods are called.
 class SubAllocator {
  public:
+  // Visitor gets called with a pointer to a memory area and its
+  // size in bytes. The index value will be numa_node for a CPU
+  // allocator and GPU id for a GPU allocator.
+  typedef std::function<void(void*, int index, size_t)> Visitor;
+
+  SubAllocator(const std::vector<Visitor>& alloc_visitors,
+               const std::vector<Visitor>& free_visitors);
+
   virtual ~SubAllocator() {}
   virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
   virtual void Free(void* ptr, size_t num_bytes) = 0;
+
+ protected:
+  // Implementation of Alloc() method must call this on newly allocated
+  // value.
+  void VisitAlloc(void* ptr, int index, size_t num_bytes);
+
+  // Implementation of Free() method must call this on value to be
+  // freed immediately before deallocation.
+  void VisitFree(void* ptr, int index, size_t num_bytes);
+
+  const std::vector<Visitor> alloc_visitors_;
+  const std::vector<Visitor> free_visitors_;
 };
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 794250a2c1..53ac639b4c 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -214,10 +214,12 @@ class DeviceBase {
   // This is overridden by GPU devices to reinitialize the derived
   // type returned by MakeGpuDevice.
-  virtual void ReinitializeGpuDevice(OpKernelContext* /*context*/,
-                                     PerOpGpuDevice* /*device*/,
-                                     DeviceContext* /*dc*/,
-                                     Allocator* /*allocator*/) {}
+  virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
+                                       PerOpGpuDevice* /*device*/,
+                                       DeviceContext* /*dc*/,
+                                       Allocator* /*allocator*/) {
+    return Status::OK();
+  }
   // Unimplemented by default
   virtual const DeviceAttributes& attributes() const;
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 80f2b12987..3e34bf0418 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -265,9 +265,12 @@ OpKernelContext::OpKernelContext(Params* params, int num_outputs)
   params_->ensure_eigen_gpu_device();
   if (params_->eigen_gpu_device != nullptr) {
     Allocator* eigen_gpu_allocator = get_allocator(AllocatorAttributes());
-    params_->device->ReinitializeGpuDevice(this, params_->eigen_gpu_device,
-                                           params_->op_device_context,
-                                           eigen_gpu_allocator);
+    Status s = params_->device->ReinitializeGpuDevice(
+        this, params_->eigen_gpu_device, params_->op_device_context,
+        eigen_gpu_allocator);
+    if (!s.ok()) {
+      SetStatus(s);
+    }
   }
   if (params_->record_tensor_accesses) {
     referenced_tensors_.Init();
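Taken together, the SubAllocator hooks define a contract for implementers: call VisitAlloc after obtaining memory and VisitFree immediately before releasing it, exactly as BasicCPUAllocator does above. A self-contained sketch of a conforming implementation; the class itself is illustrative and not part of the patch:

// Illustrative SubAllocator that satisfies the VisitAlloc/VisitFree contract.
#include <vector>
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/mem.h"

namespace tensorflow {
class ExampleSubAllocator : public SubAllocator {
 public:
  ExampleSubAllocator(int index, const std::vector<Visitor>& alloc_visitors,
                      const std::vector<Visitor>& free_visitors)
      : SubAllocator(alloc_visitors, free_visitors), index_(index) {}

  void* Alloc(size_t alignment, size_t num_bytes) override {
    void* ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
    VisitAlloc(ptr, index_, num_bytes);  // required: after allocation succeeds
    return ptr;
  }

  void Free(void* ptr, size_t num_bytes) override {
    VisitFree(ptr, index_, num_bytes);  // required: just before deallocation
    port::AlignedFree(ptr);
  }

 private:
  const int index_;  // numa_node or GPU id reported to visitors
};
}  // namespace tensorflow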