Diffstat (limited to 'tensorflow')
-rw-r--r--  tensorflow/contrib/gdr/gdr_memory_manager.cc                   | 102
-rw-r--r--  tensorflow/contrib/verbs/rdma_mgr.cc                           |  81
-rw-r--r--  tensorflow/contrib/verbs/rdma_mgr.h                            |   1
-rw-r--r--  tensorflow/contrib/verbs/verbs_server_lib.cc                   |   5
-rw-r--r--  tensorflow/core/BUILD                                          |   1
-rw-r--r--  tensorflow/core/common_runtime/bfc_allocator.cc                |  21
-rw-r--r--  tensorflow/core/common_runtime/bfc_allocator.h                 |  14
-rw-r--r--  tensorflow/core/common_runtime/gpu/cuda_host_allocator.h       |  12
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc        |  17
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h         |  44
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc   |  90
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc |  10
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h  |  11
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc      |  20
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h       |  20
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc |  35
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.cc               |  64
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.h                |   9
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_process_state.cc        | 161
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_process_state.h         |  58
-rw-r--r--  tensorflow/core/common_runtime/gpu/pool_allocator_test.cc      |  68
-rw-r--r--  tensorflow/core/common_runtime/mkl_cpu_allocator.h             |  50
-rw-r--r--  tensorflow/core/common_runtime/pool_allocator.cc               |  45
-rw-r--r--  tensorflow/core/common_runtime/pool_allocator.h                |  27
-rw-r--r--  tensorflow/core/common_runtime/process_state.cc                |  71
-rw-r--r--  tensorflow/core/common_runtime/process_state.h                 |  15
-rw-r--r--  tensorflow/core/common_runtime/renamed_device.h                |   7
-rw-r--r--  tensorflow/core/common_runtime/visitable_allocator.h           |  79
-rw-r--r--  tensorflow/core/framework/allocator.cc                         |  20
-rw-r--r--  tensorflow/core/framework/allocator.h                          |  28
-rw-r--r--  tensorflow/core/framework/device_base.h                        |  10
-rw-r--r--  tensorflow/core/framework/op_kernel.cc                         |   9
32 files changed, 628 insertions(+), 577 deletions(-)
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index 726f74c7b7..bb06f1c41c 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -138,6 +138,8 @@ class GdrMemoryManager : public RemoteMemoryManager {
Device* device, DeviceContext* device_context, bool on_host,
StatusCallback done) override;
+ static void RegMemVisitors();
+
protected:
Status CreateEndpoint(const string& host, const string& port,
RdmaEndpointPtr& endpoint);
@@ -183,35 +185,51 @@ class GdrMemoryManager : public RemoteMemoryManager {
TF_DISALLOW_COPY_AND_ASSIGN(GdrMemoryManager);
};
-// TODO(byronyi): remove this class and its registration when the default
-// cpu_allocator() returns visitable allocator, or cpu_allocator() is no
-// longer in use.
-class BFCGdrAllocator : public BFCAllocator {
- public:
- BFCGdrAllocator()
- : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36,
- true, "cpu_gdr_bfc") {}
-};
-class BFCGdrAllocatorFactory : public AllocatorFactory {
- public:
- Allocator* CreateAllocator() override { return new BFCGdrAllocator; }
-
- virtual SubAllocator* CreateSubAllocator(int numa_node) {
- return new BasicCPUAllocator(numa_node);
- }
-};
-
-REGISTER_MEM_ALLOCATOR("BFCGdrAllocator", 102, BFCGdrAllocatorFactory);
-
GdrMemoryManager::GdrMemoryManager(const string& host, const string& port)
: host_(host),
port_(port),
listening_(nullptr, EndpointDeleter),
stopped_(true),
- next_key_(0) {}
+ next_key_(0) {
+ static std::once_flag flag;
+ std::call_once(flag, []() { RegMemVisitors(); });
+}
GdrMemoryManager::~GdrMemoryManager() { close(epfd_); }
+/*static*/ void GdrMemoryManager::RegMemVisitors() {
+ SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node,
+ size_t num_bytes) {
+ GdrMemoryManager::Singleton().InsertMemoryRegion(
+ ptr, num_bytes, strings::StrCat("CPU:", numa_node));
+ };
+ SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node,
+ size_t num_bytes) {
+ GdrMemoryManager::Singleton().EvictMemoryRegion(ptr, num_bytes);
+ };
+ ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
+ ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
+
+#if GOOGLE_CUDA
+ if (IsGDRAvailable()) {
+ int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
+
+ // Note we don't free allocated GPU memory so there is no free visitor
+ SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id,
+ size_t num_bytes) {
+ GdrMemoryManager::Singleton().InsertMemoryRegion(
+ ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
+ };
+ GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
+ cuda_alloc_visitor);
+ GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
+ alloc_visitor);
+ GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
+ LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
+ }
+#endif // GOOGLE_CUDA
+}
+
Status GdrMemoryManager::Init() {
epfd_ = epoll_create1(0);
if (epfd_ == -1) {
@@ -271,48 +289,6 @@ Status GdrMemoryManager::Init() {
"cannot add server to epoll");
}
- Allocator* allocators[] = {
-#if GOOGLE_CUDA
- GPUProcessState::singleton()->GetCUDAHostAllocator(0),
-#endif // GOOGLE_CUDA
- ProcessState::singleton()->GetCPUAllocator(0),
- cpu_allocator(),
- };
-
- using namespace std::placeholders;
- VisitableAllocator::Visitor alloc_visitor =
- std::bind(&GdrMemoryManager::InsertMemoryRegion, this, _1, _2);
- VisitableAllocator::Visitor free_visitor =
- std::bind(&GdrMemoryManager::EvictMemoryRegion, this, _1, _2);
-
- std::set<Allocator*> instrumented_;
-
- // Host memory allocators
- for (Allocator* allocator : allocators) {
- auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
- CHECK(visitable_allocator)
- << "is not visitable for instrumentation" << allocator->Name();
- // Make sure we don't instrument the same allocator twice
- if (instrumented_.find(allocator) == std::end(instrumented_)) {
- visitable_allocator->AddAllocVisitor(alloc_visitor);
- visitable_allocator->AddFreeVisitor(free_visitor);
- instrumented_.insert(allocator);
- LOG(INFO) << "Instrumenting CPU allocator " << allocator->Name();
- }
- }
-
-#if GOOGLE_CUDA
- VisitableAllocator::Visitor cuda_alloc_visitor =
- std::bind(&GdrMemoryManager::InsertMemoryRegion, this, _1, _2);
- if (IsGDRAvailable()) {
- // Note we don't free allocated GPU memory so there is no free visitor
- int32_t bus_id = TryToReadNumaNode(listening_->verbs->device) + 1;
- GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
- cuda_alloc_visitor);
- LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
- }
-#endif // GOOGLE_CUDA
-
return Status::OK();
}
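
The hunk above replaces per-allocator visitor registration (the removed loop over VisitableAllocator instances) with one-time, process-wide registration. A minimal sketch of the new flow, assuming only the ProcessState and SubAllocator::Visitor APIs visible in this diff; RegisterRegion/UnregisterRegion are hypothetical stand-ins for the InsertMemoryRegion/EvictMemoryRegion calls:

    #include <cstddef>

    #include "tensorflow/core/common_runtime/process_state.h"
    #include "tensorflow/core/framework/allocator.h"

    namespace tensorflow {

    // Hypothetical hooks standing in for RDMA region (de)registration.
    void RegisterRegion(void* ptr, size_t num_bytes, int numa_node);
    void UnregisterRegion(void* ptr, size_t num_bytes);

    void RegisterCpuVisitorsOnce() {
      // A SubAllocator::Visitor receives the region pointer, an index
      // (NUMA node for CPU allocators, GPU id for GPU allocators), and
      // the region size in bytes.
      SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node,
                                               size_t num_bytes) {
        RegisterRegion(ptr, num_bytes, numa_node);
      };
      SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node,
                                              size_t num_bytes) {
        UnregisterRegion(ptr, num_bytes);
      };
      // Under the new scheme visitors are fixed at SubAllocator construction
      // time, so this must run before the first GetCPUAllocator() call.
      ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
      ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
    }

    }  // namespace tensorflow
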
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index 3cb5e61fac..2784bf124c 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -20,7 +20,6 @@ limitations under the License.
#include <vector>
#include "tensorflow/contrib/verbs/grpc_verbs_client.h"
#include "tensorflow/contrib/verbs/verbs_service.pb.h"
-#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"
@@ -29,6 +28,7 @@ limitations under the License.
#include "tensorflow/core/distributed_runtime/session_mgr.h"
#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
namespace tensorflow {
@@ -256,74 +256,41 @@ void MRDeleter(ibv_mr* mr) {
}
}
-// TODO(byronyi): remove this class and its registration when the default
-// cpu_allocator() returns visitable allocator, or cpu_allocator() is no
-// longer in use.
-class BFCRdmaAllocator : public BFCAllocator {
- public:
- BFCRdmaAllocator()
- : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36,
- true, "cpu_rdma_bfc") {}
-};
-class BFCRdmaAllocatorFactory : public AllocatorFactory {
- public:
- Allocator* CreateAllocator() { return new BFCRdmaAllocator; }
-
- SubAllocator* CreateSubAllocator(int numa_node) {
- return new BasicCPUAllocator(numa_node);
- }
-};
-
-REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocatorFactory);
-
void RdmaMgr::InitAllocators() {
- RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_;
+ static std::once_flag flag;
+ std::call_once(
+ flag, [this]() { RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_; });
+}
- Allocator* allocators[] = {
-#if GOOGLE_CUDA
- GPUProcessState::singleton()->GetCUDAHostAllocator(0),
-#endif // GOOGLE_CUDA
- ProcessState::singleton()->GetCPUAllocator(0),
- cpu_allocator(),
+/*static*/ void RdmaMgr::RegMemVisitors() {
+ SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node,
+ size_t num_bytes) {
+ RdmaMemoryMgr::Singleton().InsertMemoryRegion(
+ ptr, num_bytes, strings::StrCat("CPU:", numa_node));
+ };
+ SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node,
+ size_t num_bytes) {
+ RdmaMemoryMgr::Singleton().EvictMemoryRegion(ptr, num_bytes);
};
- using namespace std::placeholders;
-
- std::set<Allocator*> instrumented_;
-
- // Host memory allocators
- for (Allocator* allocator : allocators) {
- VisitableAllocator::Visitor alloc_visitor =
- std::bind(&RdmaMemoryMgr::InsertMemoryRegion,
- &RdmaMemoryMgr::Singleton(), _1, _2, allocator->Name());
- VisitableAllocator::Visitor free_visitor = std::bind(
- &RdmaMemoryMgr::EvictMemoryRegion, &RdmaMemoryMgr::Singleton(), _1, _2);
-
- auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
- CHECK(visitable_allocator)
- << "is not visitable for instrumentation" << allocator->Name();
- // Make sure we don't instrument the same allocator twice
- if (instrumented_.find(allocator) == std::end(instrumented_)) {
- visitable_allocator->AddAllocVisitor(alloc_visitor);
- visitable_allocator->AddFreeVisitor(free_visitor);
- instrumented_.insert(allocator);
- LOG(INFO) << "Instrumenting CPU allocator " << allocator->Name();
- }
- }
+ ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
+ ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
#if GOOGLE_CUDA
if (IsGDRAvailable()) {
// Note we don't free allocated GPU memory so there is no free visitor
int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
- char buf[8];
- sprintf(buf, "gpu");
- VisitableAllocator::Visitor cuda_alloc_visitor =
- std::bind(&RdmaMemoryMgr::InsertMemoryRegion,
- &RdmaMemoryMgr::Singleton(), _1, _2, std::string(buf));
-
+ SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id,
+ size_t num_bytes) {
+ RdmaMemoryMgr::Singleton().InsertMemoryRegion(
+ ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
+ };
GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
cuda_alloc_visitor);
+ GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
+ alloc_visitor);
+ GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
}
#endif // GOOGLE_CUDA
diff --git a/tensorflow/contrib/verbs/rdma_mgr.h b/tensorflow/contrib/verbs/rdma_mgr.h
index 9fffc335bb..74b92cc9a6 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.h
+++ b/tensorflow/contrib/verbs/rdma_mgr.h
@@ -39,6 +39,7 @@ class RdmaMgr {
void SetupChannels();
bool ConnectivityCheck();
void InitAllocators();
+ static void RegMemVisitors();
const string& local_worker() { return local_worker_; }
private:
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index 1a0b5028fe..61469686e4 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -76,8 +76,13 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
return Status::OK();
}
+namespace {
+std::once_flag reg_mem_visitors_call;
+} // namespace
+
Status VerbsServer::Init(ServiceInitFunction service_func,
RendezvousMgrCreationFunction rendezvous_mgr_func) {
+ std::call_once(reg_mem_visitors_call, []() { RdmaMgr::RegMemVisitors(); });
Status s = GrpcServer::Init(service_func, rendezvous_mgr_func);
{
mutex_lock l(mu_);
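
The guard above depends on std::once_flag and std::call_once from <mutex>. A self-contained sketch of the semantics being relied on (the function name is illustrative):

    #include <mutex>

    namespace {
    std::once_flag reg_mem_visitors_call;

    void RegisterVisitorsExactlyOnce() {
      // The lambda runs at most once per process, even if several threads
      // reach this point concurrently; later callers block until the first
      // invocation has completed.
      std::call_once(reg_mem_visitors_call, []() {
        // ... register memory visitors here ...
      });
    }
    }  // namespace
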
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index d55bd8d7ed..9bcf5b0865 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2783,7 +2783,6 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
"common_runtime/step_stats_collector.h",
"common_runtime/threadpool_device.h",
"common_runtime/tracing_device.h",
- "common_runtime/visitable_allocator.h",
"common_runtime/process_state.h",
"common_runtime/pool_allocator.h",
"graph/gradients.h",
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 84c6285bbe..3843ea9e60 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -31,7 +31,7 @@ namespace tensorflow {
BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
bool allow_growth, const string& name)
- : suballocator_(sub_allocator),
+ : sub_allocator_(sub_allocator),
name_(name),
free_chunks_list_(kInvalidChunkHandle),
next_allocation_id_(1) {
@@ -72,7 +72,7 @@ BFCAllocator::~BFCAllocator() {
VLOG(2) << "Number of regions allocated: "
<< region_manager_.regions().size();
for (const auto& region : region_manager_.regions()) {
- suballocator_->Free(region.ptr(), region.memory_size());
+ sub_allocator_->Free(region.ptr(), region.memory_size());
}
for (BinNum b = 0; b < kNumBins; b++) {
@@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
// Try allocating.
size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
- void* mem_addr = suballocator_->Alloc(alignment, bytes);
+ void* mem_addr = sub_allocator_->Alloc(alignment, bytes);
if (mem_addr == nullptr && !started_backpedal_) {
// Only backpedal once.
started_backpedal_ = true;
@@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
while (mem_addr == nullptr) {
bytes = RoundedBytes(bytes * kBackpedalFactor);
if (bytes < rounded_bytes) break;
- mem_addr = suballocator_->Alloc(alignment, bytes);
+ mem_addr = sub_allocator_->Alloc(alignment, bytes);
}
}
@@ -158,10 +158,6 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
// Insert the chunk into the right bin.
InsertFreeChunkIntoBin(h);
- // Invoke visitors on newly allocated region.
- for (const auto& visitor : region_visitors_) {
- visitor(mem_addr, bytes);
- }
return true;
}
@@ -490,15 +486,6 @@ void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) {
InsertFreeChunkIntoBin(coalesced_chunk);
}
-void BFCAllocator::AddAllocVisitor(Visitor visitor) {
- VLOG(1) << "AddVisitor";
- mutex_lock l(lock_);
- region_visitors_.push_back(visitor);
- for (const auto& region : region_manager_.regions()) {
- visitor(region.ptr(), region.memory_size());
- }
-}
-
bool BFCAllocator::TracksAllocationSizes() { return true; }
size_t BFCAllocator::RequestedSize(const void* ptr) {
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 20e1dab1d5..364071e066 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -23,7 +23,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/common_runtime/allocator_retry.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
@@ -42,7 +42,7 @@ namespace tensorflow {
// coalescing. One assumption we make is that the process using this
// allocator owns pretty much all of the memory, and that nearly
// all requests to allocate memory go through this interface.
-class BFCAllocator : public VisitableAllocator {
+class BFCAllocator : public Allocator {
public:
// Takes ownership of sub_allocator.
BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
@@ -55,11 +55,6 @@ class BFCAllocator : public VisitableAllocator {
const AllocationAttributes& allocation_attr) override;
void DeallocateRaw(void* ptr) override;
- void AddAllocVisitor(Visitor visitor) override;
-
- // Does nothing, because memory is never freed.
- void AddFreeVisitor(Visitor visitor) override {}
-
bool TracksAllocationSizes() override;
size_t RequestedSize(const void* ptr) override;
@@ -423,7 +418,7 @@ class BFCAllocator : public VisitableAllocator {
// of the available memory.
bool started_backpedal_ = false;
- std::unique_ptr<SubAllocator> suballocator_;
+ std::unique_ptr<SubAllocator> sub_allocator_;
string name_;
// Structures mutable after construction
@@ -435,9 +430,6 @@ class BFCAllocator : public VisitableAllocator {
// Pointer to head of linked list of free Chunks
ChunkHandle free_chunks_list_ GUARDED_BY(lock_);
- // Called once on each region, ASAP.
- std::vector<Visitor> region_visitors_ GUARDED_BY(lock_);
-
// Counter containing the next unique identifier to assign to a
// newly-created chunk.
int64 next_allocation_id_ GUARDED_BY(lock_);
diff --git a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h b/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
index 636cd43575..6bd29ef775 100644
--- a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
@@ -26,8 +26,12 @@ namespace tensorflow {
class CUDAHostAllocator : public SubAllocator {
public:
// Note: stream_exec cannot be null.
- explicit CUDAHostAllocator(se::StreamExecutor* stream_exec)
- : stream_exec_(stream_exec) {
+ explicit CUDAHostAllocator(se::StreamExecutor* stream_exec, int numa_node,
+ const std::vector<Visitor>& alloc_visitors,
+ const std::vector<Visitor>& free_visitors)
+ : SubAllocator(alloc_visitors, free_visitors),
+ stream_exec_(stream_exec),
+ numa_node_(numa_node) {
CHECK(stream_exec_ != nullptr);
}
~CUDAHostAllocator() override {}
@@ -39,19 +43,23 @@ class CUDAHostAllocator : public SubAllocator {
if (ptr == nullptr) {
LOG(WARNING) << "could not allocate pinned host memory of size: "
<< num_bytes;
+ return ptr;
}
+ VisitAlloc(ptr, numa_node_, num_bytes);
}
return ptr;
}
void Free(void* ptr, size_t num_bytes) override {
if (ptr != nullptr) {
+ VisitFree(ptr, numa_node_, num_bytes);
stream_exec_->HostMemoryDeallocate(ptr);
}
}
private:
se::StreamExecutor* stream_exec_; // not owned, non-null
+ const int numa_node_;
TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
};
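
CUDAHostAllocator now shows the contract every SubAllocator follows after this change: pass the visitor lists to the SubAllocator base, call VisitAlloc after each successful allocation, and call VisitFree just before releasing memory. A minimal host-memory sketch of the same contract, assuming the SubAllocator base class as it appears in this diff; MallocSubAllocator is illustrative, not part of TensorFlow:

    #include <cstdlib>
    #include <vector>

    #include "tensorflow/core/framework/allocator.h"

    namespace tensorflow {

    // Illustrative SubAllocator backed by plain malloc/free, showing where
    // the visitor calls sit. numa_node_ exists only so the visitors get a
    // meaningful index argument.
    class MallocSubAllocator : public SubAllocator {
     public:
      MallocSubAllocator(int numa_node,
                         const std::vector<Visitor>& alloc_visitors,
                         const std::vector<Visitor>& free_visitors)
          : SubAllocator(alloc_visitors, free_visitors),
            numa_node_(numa_node) {}

      void* Alloc(size_t alignment, size_t num_bytes) override {
        void* ptr = std::malloc(num_bytes);  // alignment ignored in sketch
        if (ptr != nullptr) {
          VisitAlloc(ptr, numa_node_, num_bytes);  // after a successful alloc
        }
        return ptr;
      }

      void Free(void* ptr, size_t num_bytes) override {
        if (ptr != nullptr) {
          VisitFree(ptr, numa_node_, num_bytes);  // before the memory goes away
          std::free(ptr);
        }
      }

     private:
      const int numa_node_;
    };

    }  // namespace tensorflow
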
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
index 2d4c8d0201..44ffce77a1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
@@ -22,18 +22,15 @@ limitations under the License.
namespace tensorflow {
-GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
- const string& name)
- : GPUBFCAllocator(cuda_gpu_id, total_memory, GPUOptions(), name) {}
+GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator,
+ size_t total_memory, const string& name)
+ : GPUBFCAllocator(sub_allocator, total_memory, GPUOptions(), name) {}
-GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator,
+ size_t total_memory,
const GPUOptions& gpu_options,
const string& name)
- : BFCAllocator(
- new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
- gpu_options.per_process_gpu_memory_fraction() > 1.0 ||
- gpu_options.experimental().use_unified_memory()),
- total_memory, gpu_options.allow_growth(), name) {}
+ : BFCAllocator(sub_allocator, total_memory, gpu_options.allow_growth(),
+ name) {}
} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index f1cc2eace1..6b6de80734 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -31,28 +31,20 @@ limitations under the License.
namespace tensorflow {
-// A GPU memory allocator that implements a 'best-fit with coalescing'
-// algorithm.
-class GPUBFCAllocator : public BFCAllocator {
- public:
- // 'cuda_gpu_id' refers to the ID of the GPU device within
- // the process and must reference a valid ID in the process.
- GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
- const string& name);
- GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
- const GPUOptions& gpu_options, const string& name);
- virtual ~GPUBFCAllocator() {}
-
- TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
-};
-
// Suballocator for GPU memory.
class GPUMemAllocator : public SubAllocator {
public:
+ // 'cuda_gpu_id' refers to the ID of the GPU device within
+ // the process and must reference a valid ID in the process.
// Note: stream_exec cannot be null.
- explicit GPUMemAllocator(se::StreamExecutor* stream_exec,
- bool use_unified_memory)
- : stream_exec_(stream_exec), use_unified_memory_(use_unified_memory) {
+ explicit GPUMemAllocator(se::StreamExecutor* stream_exec, CudaGpuId gpu_id,
+ bool use_unified_memory,
+ const std::vector<Visitor>& alloc_visitors,
+ const std::vector<Visitor>& free_visitors)
+ : SubAllocator(alloc_visitors, free_visitors),
+ stream_exec_(stream_exec),
+ gpu_id_(gpu_id),
+ use_unified_memory_(use_unified_memory) {
CHECK(stream_exec_ != nullptr);
}
~GPUMemAllocator() override {}
@@ -65,12 +57,14 @@ class GPUMemAllocator : public SubAllocator {
} else {
ptr = stream_exec_->AllocateArray<char>(num_bytes).opaque();
}
+ VisitAlloc(ptr, gpu_id_.value(), num_bytes);
}
return ptr;
}
void Free(void* ptr, size_t num_bytes) override {
if (ptr != nullptr) {
+ VisitFree(ptr, gpu_id_.value(), num_bytes);
if (use_unified_memory_) {
stream_exec_->UnifiedMemoryDeallocate(ptr);
} else {
@@ -82,11 +76,25 @@ class GPUMemAllocator : public SubAllocator {
private:
se::StreamExecutor* stream_exec_; // not owned, non-null
+ const CudaGpuId gpu_id_;
const bool use_unified_memory_ = false;
TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator);
};
+// A GPU memory allocator that implements a 'best-fit with coalescing'
+// algorithm.
+class GPUBFCAllocator : public BFCAllocator {
+ public:
+ GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory,
+ const string& name);
+ GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory,
+ const GPUOptions& gpu_options, const string& name);
+ ~GPUBFCAllocator() override {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
+};
+
} // namespace tensorflow
#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 67caeb3495..7112c3afd4 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -46,7 +47,11 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
}
TEST(GPUBFCAllocatorTest, NoDups) {
- GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
CheckStats(&a, 0, 0, 0, 0);
// Allocate a lot of raw pointers
@@ -75,7 +80,11 @@ TEST(GPUBFCAllocatorTest, NoDups) {
}
TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
- GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
// Allocate 256 raw pointers of sizes between 100 bytes and about
// a meg
random::PhiloxRandom philox(123, 17);
@@ -133,7 +142,11 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
}
TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
- GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
CheckStats(&a, 0, 0, 0, 0);
float* first_ptr = a.Allocate<float>(1024);
@@ -168,18 +181,30 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
}
TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
- GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
float* ptr = a.Allocate<float>(0);
EXPECT_EQ(nullptr, ptr);
}
TEST(GPUBFCAllocatorTest, TracksSizes) {
- GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
EXPECT_EQ(true, a.TracksAllocationSizes());
}
TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
- GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
float* t1 = a.Allocate<float>(1);
EXPECT_EQ(4, a.RequestedSize(t1));
EXPECT_EQ(256, a.AllocatedSize(t1));
@@ -187,8 +212,12 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
}
TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
// Configure a 1MiB byte limit
- GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc");
+ GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc");
float* first_ptr = a.Allocate<float>(1 << 6);
float* second_ptr = a.Allocate<float>(1 << 20);
@@ -203,7 +232,11 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
options.set_allow_growth(true);
// Max of 2GiB, but starts out small.
- GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1LL << 31, options, "GPU_0_bfc");
// Allocate 10 raw pointers of sizes between 100 bytes and about
// 64 megs.
@@ -264,8 +297,15 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
}
TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
- GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
- GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1UL << 60, "GPU_0_bfc");
+ sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator b(sub_allocator, 1UL << 60, "GPU_0_bfc");
void* amem = a.AllocateRaw(1, 1);
void* bmem = b.AllocateRaw(1, 1 << 30);
a.DeallocateRaw(amem);
@@ -273,7 +313,11 @@ TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
}
static void BM_Allocation(int iters) {
- GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
// Exercise a few different allocation sizes
std::vector<size_t> sizes = {256, 4096, 16384, 524288,
512, 1048576, 10485760, 104857600,
@@ -289,7 +333,11 @@ static void BM_Allocation(int iters) {
BENCHMARK(BM_Allocation);
static void BM_AllocationThreaded(int iters, int num_threads) {
- GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
thread::ThreadPool pool(Env::Default(), "test", num_threads);
std::atomic_int_fast32_t count(iters);
mutex done_lock;
@@ -325,7 +373,11 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);
// A more complex benchmark that defers deallocation of an object for
// "delay" allocations.
static void BM_AllocationDelayed(int iters, int delay) {
- GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
// Exercise a few different allocation sizes
std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
int size_index = 0;
@@ -363,7 +415,11 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
// only methods inside this class can access private members of BFCAllocator.
void TestBinDebugInfo() {
- GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
std::vector<void*> initial_ptrs;
std::vector<size_t> initial_ptrs_allocated_sizes;
@@ -441,7 +497,11 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
}
void TestLog2FloorNonZeroSlow() {
- GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc");
+ CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUBFCAllocator a(sub_allocator, 1 /* total_memory */, "GPU_0_bfc");
EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
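
Every test above now constructs its own GPUMemAllocator. The repeated boilerplate could be captured in a helper such as the following (hypothetical, shown only to summarize the new construction pattern):

    // Hypothetical test helper mirroring the construction repeated above.
    static GPUMemAllocator* CreateGPUMemAllocator(CudaGpuId cuda_gpu_id) {
      return new GPUMemAllocator(
          GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
          cuda_gpu_id, /*use_unified_memory=*/false,
          /*alloc_visitors=*/{}, /*free_visitors=*/{});
    }

    // Usage:
    //   GPUBFCAllocator a(CreateGPUMemAllocator(CudaGpuId(0)), 1 << 30,
    //                     "GPU_0_bfc");
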
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 934a57a5fb..8e14f1ea75 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -27,7 +27,7 @@ limitations under the License.
namespace tensorflow {
-GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator,
+GPUcudaMallocAllocator::GPUcudaMallocAllocator(Allocator* allocator,
CudaGpuId cuda_gpu_id)
: base_allocator_(allocator) {
stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -60,14 +60,6 @@ void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
#endif // GOOGLE_CUDA
}
-void GPUcudaMallocAllocator::AddAllocVisitor(Visitor visitor) {
- return base_allocator_->AddAllocVisitor(visitor);
-}
-
-void GPUcudaMallocAllocator::AddFreeVisitor(Visitor visitor) {
- return base_allocator_->AddFreeVisitor(visitor);
-}
-
bool GPUcudaMallocAllocator::TracksAllocationSizes() { return false; }
} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 856fdc34b4..3d1d0ef481 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -19,7 +19,7 @@ limitations under the License.
#include <memory>
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
@@ -29,20 +29,17 @@ namespace tensorflow {
// An allocator that wraps a GPU allocator and adds debugging
// functionality that verifies that users do not write outside their
// allocated memory.
-class GPUcudaMallocAllocator : public VisitableAllocator {
+class GPUcudaMallocAllocator : public Allocator {
public:
- explicit GPUcudaMallocAllocator(VisitableAllocator* allocator,
- CudaGpuId cuda_gpu_id);
+ explicit GPUcudaMallocAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
~GPUcudaMallocAllocator() override;
string Name() override { return "gpu_debug"; }
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
void DeallocateRaw(void* ptr) override;
- void AddAllocVisitor(Visitor visitor) override;
- void AddFreeVisitor(Visitor visitor) override;
bool TracksAllocationSizes() override;
private:
- VisitableAllocator* base_allocator_ = nullptr; // owned
+ Allocator* base_allocator_ = nullptr; // owned
se::StreamExecutor* stream_exec_; // Not owned.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index e4c834b30d..6bad66dcec 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -73,7 +73,7 @@ void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
// -----------------------------------------------------------------------------
// GPUDebugAllocator
// -----------------------------------------------------------------------------
-GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator,
+GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
CudaGpuId cuda_gpu_id)
: base_allocator_(allocator) {
stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -111,14 +111,6 @@ void GPUDebugAllocator::DeallocateRaw(void* ptr) {
base_allocator_->DeallocateRaw(ptr);
}
-void GPUDebugAllocator::AddAllocVisitor(Visitor visitor) {
- return base_allocator_->AddAllocVisitor(visitor);
-}
-
-void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) {
- return base_allocator_->AddFreeVisitor(visitor);
-}
-
bool GPUDebugAllocator::TracksAllocationSizes() { return true; }
size_t GPUDebugAllocator::RequestedSize(const void* ptr) {
@@ -158,7 +150,7 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) {
// -----------------------------------------------------------------------------
// GPUNanResetAllocator
// -----------------------------------------------------------------------------
-GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator,
+GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
CudaGpuId cuda_gpu_id)
: base_allocator_(allocator) {
stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -200,14 +192,6 @@ void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
base_allocator_->DeallocateRaw(ptr);
}
-void GPUNanResetAllocator::AddAllocVisitor(Visitor visitor) {
- return base_allocator_->AddAllocVisitor(visitor);
-}
-
-void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) {
- return base_allocator_->AddFreeVisitor(visitor);
-}
-
size_t GPUNanResetAllocator::RequestedSize(const void* ptr) {
return base_allocator_->RequestedSize(ptr);
}
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 0f9b72040c..0f27ff4384 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -21,7 +21,7 @@ limitations under the License.
#include <unordered_map>
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
@@ -31,16 +31,13 @@ namespace tensorflow {
// An allocator that wraps a GPU allocator and adds debugging
// functionality that verifies that users do not write outside their
// allocated memory.
-class GPUDebugAllocator : public VisitableAllocator {
+class GPUDebugAllocator : public Allocator {
public:
- explicit GPUDebugAllocator(VisitableAllocator* allocator,
- CudaGpuId cuda_gpu_id);
+ explicit GPUDebugAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
~GPUDebugAllocator() override;
string Name() override { return "gpu_debug"; }
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
void DeallocateRaw(void* ptr) override;
- void AddAllocVisitor(Visitor visitor) override;
- void AddFreeVisitor(Visitor visitor) override;
bool TracksAllocationSizes() override;
size_t RequestedSize(const void* ptr) override;
size_t AllocatedSize(const void* ptr) override;
@@ -53,7 +50,7 @@ class GPUDebugAllocator : public VisitableAllocator {
bool CheckFooter(void* ptr);
private:
- VisitableAllocator* base_allocator_ = nullptr; // owned
+ Allocator* base_allocator_ = nullptr; // owned
se::StreamExecutor* stream_exec_; // Not owned.
@@ -63,23 +60,20 @@ class GPUDebugAllocator : public VisitableAllocator {
// An allocator that wraps a GPU allocator and resets the memory on
// allocation and free to 'NaN', helping to identify cases where the
// user forgets to initialize the memory.
-class GPUNanResetAllocator : public VisitableAllocator {
+class GPUNanResetAllocator : public Allocator {
public:
- explicit GPUNanResetAllocator(VisitableAllocator* allocator,
- CudaGpuId cuda_gpu_id);
+ explicit GPUNanResetAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
~GPUNanResetAllocator() override;
string Name() override { return "gpu_nan_reset"; }
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
void DeallocateRaw(void* ptr) override;
- void AddAllocVisitor(Visitor visitor) override;
- void AddFreeVisitor(Visitor visitor) override;
size_t RequestedSize(const void* ptr) override;
size_t AllocatedSize(const void* ptr) override;
void GetStats(AllocatorStats* stats) override;
void ClearStats() override;
private:
- VisitableAllocator* base_allocator_ = nullptr; // owned
+ Allocator* base_allocator_ = nullptr; // owned
se::StreamExecutor* stream_exec_; // Not owned.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
index 236a0afa0b..98283cd846 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -35,7 +35,10 @@ namespace {
TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
const CudaGpuId cuda_gpu_id(0);
- GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
cuda_gpu_id);
auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -59,7 +62,10 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
EXPECT_DEATH(
{
const CudaGpuId cuda_gpu_id(0);
- GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
+ cuda_gpu_id, false /*use_unified_memory*/, {}, {});
+ GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
cuda_gpu_id);
auto stream_exec =
GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -92,7 +98,10 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
EXPECT_DEATH(
{
const CudaGpuId cuda_gpu_id(0);
- GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
+ cuda_gpu_id, false /*use_unified_memory*/, {}, {});
+ GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
cuda_gpu_id);
auto stream_exec =
GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -122,7 +131,10 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
TEST(GPUDebugAllocatorTest, ResetToNan) {
const CudaGpuId cuda_gpu_id(0);
- GPUNanResetAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUNanResetAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
cuda_gpu_id);
auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -163,8 +175,11 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
const CudaGpuId cuda_gpu_id(0);
// NaN reset must be the outer-most allocator.
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
GPUNanResetAllocator a(
- new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+ new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
cuda_gpu_id),
cuda_gpu_id);
auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -205,15 +220,21 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
TEST(GPUDebugAllocatorTest, TracksSizes) {
const CudaGpuId cuda_gpu_id(0);
- GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
+ GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
cuda_gpu_id);
EXPECT_EQ(true, a.TracksAllocationSizes());
}
TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
const CudaGpuId cuda_gpu_id(0);
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ false /*use_unified_memory*/, {}, {});
GPUNanResetAllocator a(
- new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+ new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
cuda_gpu_id),
cuda_gpu_id);
float* t1 = a.Allocate<float>(1);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 2763ac0d4a..50e61b7e00 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -41,7 +41,6 @@ limitations under the License.
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/core/common_runtime/local_device.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
@@ -285,6 +284,38 @@ BaseGPUDevice::~BaseGPUDevice() {
for (auto ctx : device_contexts_) ctx->Unref();
}
+// This should be idempotent if already initialized.
+Status BaseGPUDevice::InitScratchBuffers() {
+ mutex_lock l(scratch_init_mutex_);
+ if (scratch_.size() < max_streams_) {
+ for (int i = 0; i < max_streams_; i++) {
+ DCHECK(streams_[i]);
+ if (scratch_.size() > i && scratch_[i]) continue;
+ size_t scratch_buffer_size =
+ Eigen::kCudaScratchSize + sizeof(unsigned int);
+ void* scratch_buffer = gpu_allocator_->AllocateRaw(
+ Allocator::kAllocatorAlignment, scratch_buffer_size);
+ if (scratch_buffer == nullptr) {
+ return errors::FailedPrecondition(
+ "Failed to allocate scratch buffer for device ",
+ tf_gpu_id_.value());
+ }
+ se::DeviceMemory<char> mem(
+ se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
+
+ bool ok = executor_->SynchronousMemZero(
+ &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
+ if (!ok) {
+ return errors::FailedPrecondition(
+ "Failed to memcopy into scratch buffer for device ",
+ tf_gpu_id_.value());
+ }
+ scratch_.push_back(static_cast<char*>(scratch_buffer));
+ }
+ }
+ return Status::OK();
+}
+
Status BaseGPUDevice::Init(const SessionOptions& options) {
auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
if (!executor_status.status().ok()) {
@@ -303,27 +334,6 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
for (int i = 0; i < max_streams_; i++) {
streams_.push_back(StreamGroupFactory::Global().GetOrCreate(
tf_gpu_id_, i, executor_, options.config.gpu_options()));
-
- size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
- void* scratch_buffer = gpu_allocator_->AllocateRaw(
- Allocator::kAllocatorAlignment, scratch_buffer_size);
- if (scratch_buffer == nullptr) {
- return errors::FailedPrecondition(
- "Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
- }
- scratch_.push_back(static_cast<char*>(scratch_buffer));
-
- se::DeviceMemory<char> mem(
- se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
-
- bool ok = executor_->SynchronousMemZero(
- &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
- if (!ok) {
- return errors::FailedPrecondition(
- "Failed to memcopy into scratch buffer for device ",
- tf_gpu_id_.value());
- }
-
device_contexts_.push_back(new GPUDeviceContext(
i, streams_.back()->compute, streams_.back()->host_to_device,
streams_.back()->device_to_host, streams_.back()->device_to_device));
@@ -867,10 +877,11 @@ PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
return new ConcretePerOpGpuDevice();
}
-void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
- PerOpGpuDevice* device,
- DeviceContext* dc,
- Allocator* allocator) {
+Status BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
+ PerOpGpuDevice* device,
+ DeviceContext* dc,
+ Allocator* allocator) {
+ TF_RETURN_IF_ERROR(InitScratchBuffers());
if (dc) {
const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
const int stream_id = gpu_dc->stream_id();
@@ -881,6 +892,7 @@ void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
} else {
ReinitializeDevice(context, device, 0, allocator);
}
+ return Status::OK();
}
Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr,
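
ReinitializeGpuDevice returns a Status now that scratch-buffer setup is deferred to first use and can fail. Callers must propagate the error instead of assuming success; a sketch of the call-site pattern, assuming only the signature shown above (the wrapper function is illustrative, not part of the diff):

    #include "tensorflow/core/common_runtime/gpu/gpu_device.h"

    namespace tensorflow {

    // Illustrative caller showing Status propagation (op_kernel.cc also
    // changes in this diff to account for the new return type).
    Status PrepareGpuDeviceForOp(BaseGPUDevice* gpu, OpKernelContext* ctx,
                                 PerOpGpuDevice* device, DeviceContext* dc,
                                 Allocator* allocator) {
      // Lazily allocates and zeroes the Eigen scratch buffers on first use.
      TF_RETURN_IF_ERROR(gpu->ReinitializeGpuDevice(ctx, device, dc, allocator));
      return Status::OK();
    }

    }  // namespace tensorflow
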
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 56d03d7a8c..b3eea55758 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -86,8 +86,9 @@ class BaseGPUDevice : public LocalDevice {
// The caller owns the returned device.
PerOpGpuDevice* MakeGpuDevice() override;
- void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
- DeviceContext* dc, Allocator* allocator) override;
+ Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
+ DeviceContext* dc,
+ Allocator* allocator) override;
// Returns the CUDA GPU id of this device within the native driver system;
// e.g., for CUDA this is the ordinal of the GPU within the system.
@@ -125,6 +126,7 @@ class BaseGPUDevice : public LocalDevice {
class StreamGroupFactory;
gtl::InlinedVector<StreamGroup*, 4> streams_;
+ mutex scratch_init_mutex_;
gtl::InlinedVector<char*, 4> scratch_;
std::vector<GPUDeviceContext*> device_contexts_;
GpuDeviceInfo* gpu_device_info_ = nullptr;
@@ -135,6 +137,9 @@ class BaseGPUDevice : public LocalDevice {
std::unique_ptr<EventMgr> em_;
std::unique_ptr<thread::ThreadPool> thread_pool_;
+ // Initialize scratch buffers used by Eigen.
+ Status InitScratchBuffers();
+
void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
int stream_id, Allocator* allocator);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index b18688174d..9ec740fabe 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -76,12 +76,16 @@ GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
// This function is defined for debugging problems with the allocators.
GPUProcessState::~GPUProcessState() {
CHECK_EQ(this, instance_);
- for (auto p : gpu_allocators_) {
- delete p;
- }
instance_ = nullptr;
}
+int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
+ // Return the NUMA node associated with the GPU's StreamExecutor.
+ se::StreamExecutor* se =
+ GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
+ return se->GetDeviceDescription().numa_node();
+}
+
Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
TfGpuId tf_gpu_id,
size_t total_bytes) {
@@ -93,13 +97,10 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
gpu_allocators_.resize(tf_gpu_id.value() + 1);
- if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
- gpu_al_.resize(tf_gpu_id.value() + 1);
}
- if (gpu_allocators_[tf_gpu_id.value()] == nullptr) {
- VisitableAllocator* gpu_allocator;
-
+ AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
+ if (allocator_parts.allocator.get() == nullptr) {
// Validate allocator types.
if (!allocator_type.empty() && allocator_type != "BFC") {
LOG(ERROR) << "Invalid allocator type: " << allocator_type;
@@ -108,8 +109,17 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
CudaGpuId cuda_gpu_id;
TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
- gpu_allocator =
- new GPUBFCAllocator(cuda_gpu_id, total_bytes, options,
+ int bus_id = BusIdForGPU(tf_gpu_id);
+ while (bus_id >= gpu_visitors_.size()) {
+ gpu_visitors_.push_back({});
+ }
+ GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+ (options.per_process_gpu_memory_fraction() > 1.0 ||
+ options.experimental().use_unified_memory()),
+ gpu_visitors_[bus_id], {});
+ Allocator* gpu_allocator =
+ new GPUBFCAllocator(sub_allocator, total_bytes, options,
strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
// If true, checks for memory overwrites by writing
@@ -123,34 +133,25 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
// **WARNING** probably will not work in a multi-gpu scenario
gpu_allocator = new GPUcudaMallocAllocator(gpu_allocator, cuda_gpu_id);
}
- gpu_allocators_[tf_gpu_id.value()] = gpu_allocator;
-
- // If there are any pending AllocVisitors for this bus, add
- // them now.
- se::StreamExecutor* se =
- GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
- int bus_id = se->GetDeviceDescription().numa_node();
- if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
- for (const auto& v : gpu_visitors_[bus_id]) {
- gpu_allocator->AddAllocVisitor(v);
- }
- }
+
+ Allocator* recording_allocator = nullptr;
if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
ProcessState::MemDesc md;
md.loc = ProcessState::MemDesc::GPU;
md.dev_index = cuda_gpu_id.value();
md.gpu_registered = false;
md.nic_registered = true;
- if (static_cast<int64>(gpu_al_.size()) <= tf_gpu_id.value()) {
- gpu_al_.resize(tf_gpu_id.value() + 1);
- }
- gpu_al_[tf_gpu_id.value()] = new internal::RecordingAllocator(
+ recording_allocator = new internal::RecordingAllocator(
&process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
}
+ allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator), sub_allocator,
+ std::unique_ptr<Allocator>(recording_allocator)};
+ }
+ if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
+ return allocator_parts.recording_allocator.get();
+ } else {
+ return allocator_parts.allocator.get();
}
- if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
- return gpu_al_[tf_gpu_id.value()];
- return gpu_allocators_[tf_gpu_id.value()];
#else
LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda.";
return nullptr;
@@ -172,11 +173,12 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
tf_shared_lock lock(mu_);
if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
- static_cast<int>(cuda_al_.size()) > 0) {
- return cuda_al_[0];
+ !cuda_host_allocators_.empty() &&
+ cuda_host_allocators_[0].recording_allocator != nullptr) {
+ return cuda_host_allocators_[0].recording_allocator.get();
}
if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
- return cuda_host_allocators_[0];
+ return cuda_host_allocators_[0].allocator.get();
}
}
@@ -190,7 +192,7 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
// it knows is valid.
se::StreamExecutor* se = nullptr;
for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
- if (gpu_allocators_[i] != nullptr) {
+ if (gpu_allocators_[i].allocator != nullptr) {
se = GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
break;
}
@@ -199,6 +201,15 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
CHECK_NE(nullptr, se);
while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) {
+ while (cuda_host_alloc_visitors_.size() <= numa_node) {
+ cuda_host_alloc_visitors_.push_back({});
+ }
+ while (cuda_host_free_visitors_.size() <= numa_node) {
+ cuda_host_free_visitors_.push_back({});
+ }
+ SubAllocator* sub_allocator = new CUDAHostAllocator(
+ se, numa_node, cuda_host_alloc_visitors_[numa_node],
+ cuda_host_free_visitors_[numa_node]);
// TODO(zheng-xq): evaluate whether 64GB by default is the best choice.
int64 cuda_host_mem_limit_in_mb = -1;
Status status = ReadInt64FromEnvVar("TF_CUDA_HOST_MEM_LIMIT_IN_MB",
@@ -208,62 +219,92 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
LOG(ERROR) << "GetCUDAHostAllocator: " << status.error_message();
}
int64 cuda_host_mem_limit = cuda_host_mem_limit_in_mb * (1LL << 20);
- VisitableAllocator* allocator =
- new BFCAllocator(new CUDAHostAllocator(se), cuda_host_mem_limit,
+ Allocator* allocator =
+ new BFCAllocator(sub_allocator, cuda_host_mem_limit,
true /*allow_growth*/, "cuda_host_bfc" /*name*/);
- if (LogMemory::IsEnabled()) {
+ if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
// Wrap the allocator to track allocation ids for better logging
// at the cost of performance.
- allocator = new TrackingVisitableAllocator(allocator, true);
+ allocator = new TrackingAllocator(allocator, true);
}
- cuda_host_allocators_.push_back(allocator);
+ cuda_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
+ sub_allocator,
+ std::unique_ptr<Allocator>(nullptr)});
+ AllocatorParts& allocator_parts = cuda_host_allocators_.back();
if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
ProcessState::MemDesc md;
md.loc = ProcessState::MemDesc::CPU;
md.dev_index = 0;
md.gpu_registered = true;
md.nic_registered = false;
- cuda_al_.push_back(new internal::RecordingAllocator(
- &process_state_->mem_desc_map_, cuda_host_allocators_.back(), md,
- &mu_));
+ allocator_parts.recording_allocator.reset(
+ new internal::RecordingAllocator(&process_state_->mem_desc_map_,
+ allocator_parts.allocator.get(), md,
+ &mu_));
}
}
- if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
- return cuda_al_[0];
- return cuda_host_allocators_[0];
+ if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
+ return cuda_host_allocators_[0].recording_allocator.get();
+ } else {
+ return cuda_host_allocators_[0].allocator.get();
+ }
}
void GPUProcessState::AddGPUAllocVisitor(int bus_id,
- const AllocVisitor& visitor) {
- CHECK(process_state_);
+ const SubAllocator::Visitor& visitor) {
#if GOOGLE_CUDA
mutex_lock lock(mu_);
- for (int i = 0; i < static_cast<int64>(gpu_allocators_.size()); ++i) {
- se::StreamExecutor* se =
- GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
- if (gpu_allocators_[i] &&
- (se->GetDeviceDescription().numa_node() + 1) == bus_id) {
- gpu_allocators_[i]->AddAllocVisitor(visitor);
- }
- }
+ CHECK(gpu_allocators_.empty()) // Crash OK
+ << "AddGPUAllocVisitor must be called before "
+ "first call to GetGPUAllocator.";
while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
- gpu_visitors_.push_back(std::vector<AllocVisitor>());
+ gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
}
gpu_visitors_[bus_id].push_back(visitor);
#endif // GOOGLE_CUDA
}
+void GPUProcessState::AddCUDAHostAllocVisitor(
+ int numa_node, const SubAllocator::Visitor& visitor) {
+#if GOOGLE_CUDA
+ mutex_lock lock(mu_);
+ CHECK(cuda_host_allocators_.empty()) // Crash OK
+ << "AddCUDAHostAllocVisitor must be called before "
+ "first call to GetCUDAHostAllocator.";
+ while (numa_node >= static_cast<int64>(cuda_host_alloc_visitors_.size())) {
+ cuda_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>());
+ }
+ cuda_host_alloc_visitors_[numa_node].push_back(visitor);
+#endif // GOOGLE_CUDA
+}
+
+void GPUProcessState::AddCUDAHostFreeVisitor(
+ int numa_node, const SubAllocator::Visitor& visitor) {
+#if GOOGLE_CUDA
+ mutex_lock lock(mu_);
+ CHECK(cuda_host_allocators_.empty()) // Crash OK
+ << "AddCUDAHostFreeVisitor must be called before "
+ "first call to GetCUDAHostAllocator.";
+ while (numa_node >= static_cast<int64>(cuda_host_free_visitors_.size())) {
+ cuda_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>());
+ }
+ cuda_host_free_visitors_[numa_node].push_back(visitor);
+#endif // GOOGLE_CUDA
+}
+
void GPUProcessState::TestOnlyReset() {
- process_state_->ProcessState::TestOnlyReset();
+ if (process_state_) {
+ process_state_->ProcessState::TestOnlyReset();
+ }
{
mutex_lock lock(mu_);
gpu_device_enabled_ = false;
+ gpu_allocators_.clear();
gpu_visitors_.clear();
- gtl::STLDeleteElements(&gpu_allocators_);
- gtl::STLDeleteElements(&cuda_host_allocators_);
- gtl::STLDeleteElements(&gpu_al_);
- gtl::STLDeleteElements(&cuda_al_);
+ cuda_host_allocators_.clear();
+ cuda_host_alloc_visitors_.clear();
+ cuda_host_free_visitors_.clear();
}
}
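As a usage sketch (not part of the patch): with the new hooks, host-memory registration for a NIC would be wired up before the first allocator request. GPUProcessState::singleton() is assumed to be the usual accessor, and RegisterRdmaRegion/UnregisterRdmaRegion are hypothetical callbacks.

    // Sketch only: both visitors must be registered before the first call to
    // GetCUDAHostAllocator(), or the CHECKs in the code above will fire.
    GPUProcessState* gps = GPUProcessState::singleton();
    gps->AddCUDAHostAllocVisitor(
        /*numa_node=*/0, [](void* ptr, int numa_node, size_t num_bytes) {
          RegisterRdmaRegion(ptr, num_bytes);  // hypothetical NIC hook
        });
    gps->AddCUDAHostFreeVisitor(
        /*numa_node=*/0, [](void* ptr, int numa_node, size_t num_bytes) {
          UnregisterRdmaRegion(ptr, num_bytes);  // hypothetical NIC hook
        });
    Allocator* host_alloc = gps->GetCUDAHostAllocator(/*numa_node=*/0);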
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
index cb41c3c6bd..43e9a31660 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
@@ -32,7 +32,6 @@ limitations under the License.
namespace tensorflow {
class Allocator;
-class VisitableAllocator;
class PoolAllocator;
// Singleton that manages per-process state when GPUs are present.
@@ -72,18 +71,30 @@ class GPUProcessState {
virtual Allocator* GetCUDAHostAllocator(int numa_node);
- // Registers a function to be called once on every new Region
- // allocated by every GPURegionAllocator proximate to the specified
- // bus. The AllocVisitor is provided with a memory pointer and the
- // size of the area it identifies. The pointer is not guaranteed to
- // be valid after the call terminates. The intention is for this
- // interface to be used for network device memory registration.
- // "bus_id" is platform-specific. On many platforms it
- // should be 0. On machines with multiple PCIe buses, it should be
- // the index of one of the PCIe buses. If the bus_id is invalid,
- // results are undefined.
- typedef std::function<void(void*, size_t)> AllocVisitor;
- virtual void AddGPUAllocVisitor(int bus_id, const AllocVisitor& visitor);
+ // Registers a Visitor to be invoked on new chunks of memory allocated by the
+  // SubAllocator of every GPU proximate to the specified bus. The visitor
+ // is provided with a memory pointer, a GPU id, and the size of the area it
+ // identifies. The pointer is not guaranteed to be valid after the call
+ // terminates. The intention is for this interface to be used for network
+ // device memory registration. "bus_id" is platform-specific. On many
+ // platforms it should be 0. On machines with multiple PCIe buses, it should
+ // be the index of one of the PCIe buses (maybe the NUMA node at which the
+ // PCIe is rooted). If the bus_id is invalid, results are undefined.
+ virtual void AddGPUAllocVisitor(int bus_id,
+ const SubAllocator::Visitor& visitor);
+
+ // Registers a Visitor to be invoked on new chunks of memory allocated by
+ // the SubAllocator of the CUDAHostAllocator for the given numa_node.
+ virtual void AddCUDAHostAllocVisitor(int numa_node,
+ const SubAllocator::Visitor& visitor);
+
+ // Registers a Visitor to be invoked on each chunk handed back for freeing to
+ // the SubAllocator of the CUDAHostAllocator for the given numa_node.
+ virtual void AddCUDAHostFreeVisitor(int numa_node,
+ const SubAllocator::Visitor& visitor);
+
+ // Returns bus_id for the given GPU id.
+ virtual int BusIdForGPU(TfGpuId tf_gpu_id);
protected:
GPUProcessState();
@@ -103,16 +114,21 @@ class GPUProcessState {
mutex mu_;
- std::vector<VisitableAllocator*> gpu_allocators_ GUARDED_BY(mu_);
- std::vector<std::vector<AllocVisitor>> gpu_visitors_ GUARDED_BY(mu_);
- std::vector<Allocator*> cuda_host_allocators_ GUARDED_BY(mu_);
+ struct AllocatorParts {
+ std::unique_ptr<Allocator> allocator;
+ SubAllocator* sub_allocator; // owned by allocator
+ std::unique_ptr<Allocator> recording_allocator;
+ };
+ std::vector<AllocatorParts> gpu_allocators_ GUARDED_BY(mu_);
+ std::vector<std::vector<SubAllocator::Visitor>> gpu_visitors_ GUARDED_BY(mu_);
- virtual ~GPUProcessState();
+ std::vector<AllocatorParts> cuda_host_allocators_ GUARDED_BY(mu_);
+ std::vector<std::vector<SubAllocator::Visitor>> cuda_host_alloc_visitors_
+ GUARDED_BY(mu_);
+ std::vector<std::vector<SubAllocator::Visitor>> cuda_host_free_visitors_
+ GUARDED_BY(mu_);
- // Optional RecordingAllocators that wrap the corresponding
- // Allocators for runtime attribute use analysis.
- std::vector<Allocator*> gpu_al_ GUARDED_BY(mu_);
- std::vector<Allocator*> cuda_al_ GUARDED_BY(mu_);
+ virtual ~GPUProcessState();
friend class GPUDeviceTest;
};
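A similar sketch for the per-bus GPU visitor declared above, assuming the same singleton accessor; the visitor body is illustrative only:

    GPUProcessState* gps = GPUProcessState::singleton();
    const int bus_id = gps->BusIdForGPU(TfGpuId(0));
    // Must precede the first GetGPUAllocator() call for any GPU on this bus.
    gps->AddGPUAllocVisitor(bus_id,
                            [](void* ptr, int gpu_id, size_t num_bytes) {
                              VLOG(1) << "new region on GPU " << gpu_id
                                      << " at " << ptr << ", " << num_bytes
                                      << " bytes";
                            });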
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
index 583bff2c07..6b2f6547b0 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
@@ -31,7 +31,8 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) {
2 /*pool_size_limit*/, false /*auto_resize*/,
new CUDAHostAllocator(
platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
- .ValueOrDie()),
+ .ValueOrDie(),
+ 0 /*numa_node*/, {}, {}),
new NoopRounder, "pool");
EXPECT_EQ(nullptr, pool.AllocateRaw(4 /*alignment*/, 0 /*num_bytes*/));
@@ -49,7 +50,8 @@ TEST(PoolAllocatorTest, ZeroSizePool) {
0 /*pool_size_limit*/, false /*auto_resize*/,
new CUDAHostAllocator(
platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
- .ValueOrDie()),
+ .ValueOrDie(),
+ 0 /*numa_node*/, {}, {}),
new NoopRounder, "pool");
EXPECT_EQ(0, pool.get_from_pool_count());
@@ -82,7 +84,8 @@ TEST(PoolAllocatorTest, Alignment) {
0 /*pool_size_limit*/, false /*auto_resize*/,
new CUDAHostAllocator(
platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
- .ValueOrDie()),
+ .ValueOrDie(),
+ 0 /*numa_node*/, {}, {}),
new NoopRounder, "pool");
for (int i = 0; i < 16; ++i) {
size_t alignment = 1 << i;
@@ -97,8 +100,8 @@ TEST(PoolAllocatorTest, Alignment) {
TEST(PoolAllocatorTest, AutoResize) {
PoolAllocator pool(2 /*pool_size_limit*/, true /*auto_resize*/,
- new BasicCPUAllocator(0 /*numa_node*/), new NoopRounder,
- "pool");
+ new BasicCPUAllocator(0 /*numa_node*/, {}, {}),
+ new NoopRounder, "pool");
// Alloc/dealloc 10 sizes just a few times, confirming pool size
// stays at 2.
@@ -123,14 +126,32 @@ TEST(PoolAllocatorTest, AutoResize) {
}
TEST(PoolAllocatorTest, CudaHostAllocator) {
+ int alloc_count = 0;
+ int64 alloc_size = 0;
+ SubAllocator::Visitor alloc_visitor =
+ [&alloc_count, &alloc_size](void* ptr, int numa_node, int64 size) {
+ ++alloc_count;
+ alloc_size += size;
+ };
+ int free_count = 0;
+ int64 free_size = 0;
+ SubAllocator::Visitor free_visitor =
+ [&free_count, &free_size](void* ptr, int numa_node, int64 size) {
+ ++free_count;
+ free_size += size;
+ };
se::Platform* platform =
se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
- PoolAllocator pool(
- 2 /*pool_size_limit*/, false /*auto_resize*/,
- new CUDAHostAllocator(
- platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
- .ValueOrDie()),
- new NoopRounder, "pool");
+ CUDAHostAllocator* sub_allocator = new CUDAHostAllocator(
+ platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
+ .ValueOrDie(),
+ 0 /*numa_node*/, {alloc_visitor}, {free_visitor});
+ PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/,
+ sub_allocator, new NoopRounder, "pool");
+ EXPECT_EQ(0, alloc_count);
+ EXPECT_EQ(0, alloc_size);
+ EXPECT_EQ(0, free_count);
+ EXPECT_EQ(0, free_size);
// Repeatedly Get a 16-byte value, confirming that there's only
// one real allocation.
@@ -138,6 +159,10 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
EXPECT_EQ(0, pool.get_from_pool_count());
EXPECT_EQ(1, pool.allocated_count());
EXPECT_NE(nullptr, p1_16);
+ EXPECT_EQ(1, alloc_count); // Underlying suballoc of 16 bytes
+ // Each suballocation includes a 16B ChunkPrefix.
+ static const int kChunkPrefixSize = 16;
+ EXPECT_EQ(16 + (alloc_count * kChunkPrefixSize), alloc_size);
pool.DeallocateRaw(p1_16);
// Pool contents {16}
EXPECT_EQ(1, pool.put_count());
@@ -148,6 +173,9 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
pool.DeallocateRaw(p2_16); // Put it back.
// Pool contents {16}
EXPECT_EQ(2, pool.put_count());
+ EXPECT_EQ(1, alloc_count); // Underlying suballoc of 16 bytes
+ EXPECT_EQ(16 + (alloc_count * kChunkPrefixSize), alloc_size);
+ EXPECT_EQ(0, free_count);
// Get two more values of different sizes.
void* p3_4 = pool.AllocateRaw(4, 4);
@@ -160,6 +188,9 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
void* p4_2 = pool.AllocateRaw(4, 2); // Get a third size buffer.
EXPECT_NE(nullptr, p4_2);
EXPECT_EQ(0, pool.evicted_count());
+ EXPECT_EQ(3, alloc_count);
+ EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
+ EXPECT_EQ(0, free_count);
// The pool is full: when we put back p4_2, the 16-byte buffer
// should be evicted since it was least recently inserted.
@@ -167,6 +198,10 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
// Pool contents {2, 4}
EXPECT_EQ(4, pool.put_count());
EXPECT_EQ(1, pool.evicted_count());
+ EXPECT_EQ(3, alloc_count);
+ EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
+ EXPECT_EQ(1, free_count);
+ EXPECT_EQ(16 + (free_count * kChunkPrefixSize), free_size);
// Re-getting and putting size 2 or 4 should not alter pool size or
// num-evicted.
@@ -180,12 +215,20 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
EXPECT_EQ(6, pool.put_count());
EXPECT_EQ(3, pool.allocated_count());
EXPECT_EQ(1, pool.evicted_count());
+ EXPECT_EQ(3, alloc_count);
+ EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
+ EXPECT_EQ(1, free_count);
+ EXPECT_EQ(16 + (free_count * kChunkPrefixSize), free_size);
pool.Clear();
EXPECT_EQ(0, pool.get_from_pool_count());
EXPECT_EQ(0, pool.put_count());
EXPECT_EQ(0, pool.allocated_count());
EXPECT_EQ(0, pool.evicted_count());
+ EXPECT_EQ(3, alloc_count);
+ EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
+ EXPECT_EQ(3, free_count);
+ EXPECT_EQ(16 + 4 + 2 + (free_count * kChunkPrefixSize), free_size);
}
TEST(PoolAllocatorTest, Pow2Rounder) {
@@ -206,7 +249,8 @@ TEST(PoolAllocatorTest, Name) {
2 /*pool_size_limit*/, false /*auto_resize*/,
new CUDAHostAllocator(
platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
- .ValueOrDie()),
+ .ValueOrDie(),
+ 0 /*numa_node*/, {}, {}),
new NoopRounder, "pool");
EXPECT_EQ("pool", pool.Name());
}
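To make the visitor arithmetic in this test concrete: every PoolAllocator request reaches the SubAllocator enlarged by a 16-byte ChunkPrefix, so the first 16-byte Get is visited as 16 + 16 = 32 bytes, and after the three distinct sizes (16, 4, and 2 bytes) the alloc visitor has accumulated 16 + 4 + 2 + 3 * 16 = 70 bytes, which is exactly what the final alloc_size expectations assert.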
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index df9c3a686c..538a70668a 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -23,12 +23,11 @@ limitations under the License.
#include <cstdlib>
#include "tensorflow/core/common_runtime/bfc_allocator.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
-#include "tensorflow/core/framework/allocator_registry.h"
+#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/mem.h"
-#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
#ifndef INTEL_MKL_DNN_ONLY
#include "i_malloc.h"
@@ -40,20 +39,16 @@ typedef unsigned int uint;
namespace tensorflow {
-class MklSubAllocator : public SubAllocator {
+class MklSubAllocator : public BasicCPUAllocator {
public:
+ MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {}
~MklSubAllocator() override {}
-
- void* Alloc(size_t alignment, size_t num_bytes) override {
- return port::AlignedMalloc(num_bytes, alignment);
- }
- void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
};
// CPU allocator that handles small-size allocations by calling
// suballocator directly. Mostly, it is just a wrapper around a suballocator
// (that calls malloc and free directly) with support for bookkeeping.
-class MklSmallSizeAllocator : public VisitableAllocator {
+class MklSmallSizeAllocator : public Allocator {
public:
MklSmallSizeAllocator(SubAllocator* sub_allocator, size_t total_memory,
const string& name)
@@ -75,10 +70,6 @@ class MklSmallSizeAllocator : public VisitableAllocator {
CHECK(map_.insert(map_val).second);
// Increment statistics for small-size allocations.
IncrementStats(num_bytes);
- // Call alloc visitors.
- for (const auto& visitor : alloc_visitors_) {
- visitor(ptr, num_bytes);
- }
}
return ptr;
}
@@ -94,9 +85,6 @@ class MklSmallSizeAllocator : public VisitableAllocator {
if (map_iter != map_.end()) {
// Call free visitors.
size_t dealloc_bytes = map_iter->second;
- for (const auto& visitor : free_visitors_) {
- visitor(ptr, dealloc_bytes);
- }
sub_allocator_->Free(ptr, dealloc_bytes);
DecrementStats(dealloc_bytes);
map_.erase(map_iter);
@@ -121,16 +109,6 @@ class MklSmallSizeAllocator : public VisitableAllocator {
stats_.Clear();
}
- void AddAllocVisitor(Visitor visitor) override {
- mutex_lock l(mutex_);
- alloc_visitors_.push_back(visitor);
- }
-
- void AddFreeVisitor(Visitor visitor) override {
- mutex_lock l(mutex_);
- free_visitors_.push_back(visitor);
- }
-
private:
// Increment statistics for the allocator handling small allocations.
inline void IncrementStats(size_t alloc_size)
@@ -163,15 +141,11 @@ class MklSmallSizeAllocator : public VisitableAllocator {
// Allocator stats for small allocs
AllocatorStats stats_ GUARDED_BY(mutex_);
-
- // Visitors
- std::vector<Visitor> alloc_visitors_ GUARDED_BY(mutex_);
- std::vector<Visitor> free_visitors_ GUARDED_BY(mutex_);
};
/// CPU allocator for MKL that wraps BFC allocator and intercepts
/// and redirects memory allocation calls from MKL.
-class MklCPUAllocator : public VisitableAllocator {
+class MklCPUAllocator : public Allocator {
public:
// Constructor and other standard functions
@@ -284,16 +258,6 @@ class MklCPUAllocator : public VisitableAllocator {
large_size_allocator_->ClearStats();
}
- void AddAllocVisitor(Visitor visitor) override {
- small_size_allocator_->AddAllocVisitor(visitor);
- large_size_allocator_->AddAllocVisitor(visitor);
- }
-
- void AddFreeVisitor(Visitor visitor) override {
- small_size_allocator_->AddFreeVisitor(visitor);
- large_size_allocator_->AddFreeVisitor(visitor);
- }
-
private:
// Hooks provided by this allocator for memory allocation routines from MKL
@@ -330,7 +294,7 @@ class MklCPUAllocator : public VisitableAllocator {
// The alignment that we need for the allocations
static constexpr const size_t kAlignment = 64;
- VisitableAllocator* large_size_allocator_; // owned by this class
+ Allocator* large_size_allocator_; // owned by this class
MklSmallSizeAllocator* small_size_allocator_; // owned by this class.
SubAllocator* sub_allocator_; // not owned by this class
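Since MklCPUAllocator no longer exposes AddAllocVisitor/AddFreeVisitor, any visitors for MKL-managed memory would have to be supplied at SubAllocator construction. A hypothetical variant, not part of this patch, could forward them to the BasicCPUAllocator base instead of hardcoding empty lists:

    // Hypothetical sketch: visitor-aware MKL suballocator.
    class MklVisitingSubAllocator : public BasicCPUAllocator {
     public:
      MklVisitingSubAllocator(const std::vector<Visitor>& alloc_visitors,
                              const std::vector<Visitor>& free_visitors)
          : BasicCPUAllocator(port::kNUMANoAffinity, alloc_visitors,
                              free_visitors) {}
    };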
diff --git a/tensorflow/core/common_runtime/pool_allocator.cc b/tensorflow/core/common_runtime/pool_allocator.cc
index fdad8de8d6..66dc8f3322 100644
--- a/tensorflow/core/common_runtime/pool_allocator.cc
+++ b/tensorflow/core/common_runtime/pool_allocator.cc
@@ -40,8 +40,7 @@ PoolAllocator::PoolAllocator(size_t pool_size_limit, bool auto_resize,
auto_resize_(auto_resize),
pool_size_limit_(pool_size_limit),
allocator_(allocator),
- size_rounder_(size_rounder),
- allocation_begun_(false) {
+ size_rounder_(size_rounder) {
if (auto_resize) {
CHECK_LT(size_t{0}, pool_size_limit)
<< "size limit must be > 0 if auto_resize is true.";
@@ -93,7 +92,6 @@ ChunkPrefix* FindPrefix(void* user_ptr) {
} // namespace
void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
- if (!allocation_begun_) allocation_begun_ = true;
if (num_bytes == 0) return nullptr;
// If alignment is larger than kPoolAlignment, increase num_bytes so that we
@@ -129,9 +127,6 @@ void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
return PrepareChunk(r, alignment, num_bytes);
} else {
void* ptr = allocator_->Alloc(kPoolAlignment, num_bytes);
- for (const auto& v : alloc_visitors_) {
- v(ptr, num_bytes);
- }
return PrepareChunk(ptr, alignment, num_bytes);
}
}
@@ -141,9 +136,6 @@ void PoolAllocator::DeallocateRaw(void* ptr) {
ChunkPrefix* cp = FindPrefix(ptr);
CHECK_LE((void*)cp, (void*)ptr);
if (!has_size_limit_ && !auto_resize_) {
- for (const auto& v : free_visitors_) {
- v(cp, cp->num_bytes);
- }
allocator_->Free(cp, cp->num_bytes);
} else {
mutex_lock lock(mutex_);
@@ -164,9 +156,6 @@ void PoolAllocator::Clear() {
mutex_lock lock(mutex_);
for (auto iter : pool_) {
PtrRecord* pr = iter.second;
- for (const auto& v : free_visitors_) {
- v(pr->ptr, pr->num_bytes);
- }
allocator_->Free(pr->ptr, pr->num_bytes);
delete pr;
}
@@ -221,9 +210,6 @@ void PoolAllocator::EvictOne() {
DCHECK(iter != pool_.end());
}
pool_.erase(iter);
- for (const auto& v : free_visitors_) {
- v(prec->ptr, prec->num_bytes);
- }
allocator_->Free(prec->ptr, prec->num_bytes);
delete prec;
++evicted_count_;
@@ -269,28 +255,19 @@ void PoolAllocator::EvictOne() {
}
}
-void PoolAllocator::AddAllocVisitor(Visitor visitor) {
- mutex_lock lock(mutex_);
- CHECK(!allocation_begun_)
- << "AddAllocVisitor may not be called after pool allocation "
- << "has begun.";
- alloc_visitors_.push_back(visitor);
-}
-
-void PoolAllocator::AddFreeVisitor(Visitor visitor) {
- mutex_lock lock(mutex_);
- CHECK(!allocation_begun_)
- << "AddFreeVisitor may not be called after pool allocation "
- << "has begun.";
- free_visitors_.push_back(visitor);
-}
-
void* BasicCPUAllocator::Alloc(size_t alignment, size_t num_bytes) {
- return port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+ void* ptr = nullptr;
+ if (num_bytes > 0) {
+ ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+ VisitAlloc(ptr, numa_node_, num_bytes);
+ }
+ return ptr;
}
void BasicCPUAllocator::Free(void* ptr, size_t num_bytes) {
- port::AlignedFree(ptr);
+ if (num_bytes > 0) {
+ VisitFree(ptr, numa_node_, num_bytes);
+ port::AlignedFree(ptr);
+ }
}
-
} // namespace tensorflow
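The rewritten BasicCPUAllocator::Alloc/Free above imply a contract worth spelling out: visitors only ever observe non-empty regions, and a zero-byte request short-circuits to nullptr. A minimal sketch under that assumption:

    BasicCPUAllocator sub(/*numa_node=*/0, {}, {});
    CHECK(sub.Alloc(/*alignment=*/64, /*num_bytes=*/0) == nullptr);  // no visit
    void* p = sub.Alloc(64, 1024);  // alloc visitors (if any) see 1024 bytes
    sub.Free(p, 1024);              // free visitors run before AlignedFree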
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index 607734445b..5b4623ba10 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -16,14 +16,13 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_
-// Simple LRU pool allocators for various flavors of CPU RAM that
-// implement the VisitableAllocator interface.
+// Simple LRU pool allocators for various flavors of CPU RAM.
#include <atomic>
#include <map>
#include <memory>
#include <vector>
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
@@ -41,7 +40,7 @@ class RoundUpInterface {
// Size-limited pool of memory buffers obtained from a SubAllocator
// instance. Pool eviction policy is LRU.
-class PoolAllocator : public VisitableAllocator {
+class PoolAllocator : public Allocator {
public:
// "pool_size_limit" is the maximum number of returned, re-usable
// memory buffers to keep in the pool. If pool_size_limit == 0, the
@@ -64,14 +63,6 @@ class PoolAllocator : public VisitableAllocator {
void DeallocateRaw(void* ptr) override;
- // REQUIRES: The following functions may only be called prior
- // to the first Allocate*() call. Once allocation has begun, it is
- // illegal to register another visitor.
-
- void AddAllocVisitor(Visitor visitor) override;
-
- void AddFreeVisitor(Visitor visitor) override;
-
// Allocate an unused memory region of size "num_bytes". Fetch from
// the pool if available, otherwise call allocator_.
void* Get(size_t num_bytes);
@@ -141,12 +132,6 @@ class PoolAllocator : public VisitableAllocator {
int64 put_count_ GUARDED_BY(mutex_) = 0;
int64 allocated_count_ GUARDED_BY(mutex_) = 0;
int64 evicted_count_ GUARDED_BY(mutex_) = 0;
- // Write access to these is guarded by mutex_, but not read
- // access. They may only be modified prior to the first
- // allocation. Later attempts to modify will fail.
- std::vector<Visitor> alloc_visitors_;
- std::vector<Visitor> free_visitors_;
- std::atomic<bool> allocation_begun_;
};
// Do-nothing rounder. Passes through sizes unchanged.
@@ -166,7 +151,9 @@ class Pow2Rounder : public RoundUpInterface {
class BasicCPUAllocator : public SubAllocator {
public:
// Argument numa_node is currently ignored.
- explicit BasicCPUAllocator(int numa_node) : numa_node_(numa_node) {}
+ BasicCPUAllocator(int numa_node, const std::vector<Visitor>& alloc_visitors,
+ const std::vector<Visitor>& free_visitors)
+ : SubAllocator(alloc_visitors, free_visitors), numa_node_(numa_node) {}
~BasicCPUAllocator() override {}
@@ -176,6 +163,8 @@ class BasicCPUAllocator : public SubAllocator {
private:
int numa_node_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(BasicCPUAllocator);
};
} // namespace tensorflow
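A sketch of the new three-argument constructor in use, feeding a visited BasicCPUAllocator to a PoolAllocator (the byte counter is illustrative):

    int64 total_bytes = 0;
    SubAllocator::Visitor count_bytes =
        [&total_bytes](void* ptr, int numa_node, size_t num_bytes) {
          total_bytes += num_bytes;
        };
    PoolAllocator pool(
        /*pool_size_limit=*/100, /*auto_resize=*/true,
        new BasicCPUAllocator(/*numa_node=*/0, {count_bytes}, {}),
        new NoopRounder, "cpu_pool");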
diff --git a/tensorflow/core/common_runtime/process_state.cc b/tensorflow/core/common_runtime/process_state.cc
index 447338e7bd..bcaa37fc8a 100644
--- a/tensorflow/core/common_runtime/process_state.cc
+++ b/tensorflow/core/common_runtime/process_state.cc
@@ -71,20 +71,28 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
return MemDesc();
}
-VisitableAllocator* ProcessState::GetCPUAllocator(int numa_node) {
+Allocator* ProcessState::GetCPUAllocator(int numa_node) {
CHECK_GE(numa_node, 0);
if (!numa_enabled_) numa_node = 0;
mutex_lock lock(mu_);
while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
+    // If visitors have been defined we need an Allocator built from
+    // a SubAllocator. Prefer BFCAllocator, but fall back to PoolAllocator
+    // when the TF_CPU_ALLOCATOR_USE_BFC env var is explicitly set to false.
+ const bool alloc_visitors_defined =
+ (!cpu_alloc_visitors_.empty() || !cpu_free_visitors_.empty());
bool use_bfc_allocator = false;
- // TODO(reedwm): Switch default to BGFAllocator if it's at least as fast and
- // efficient.
- Status status = ReadBoolFromEnvVar("TF_CPU_ALLOCATOR_USE_BFC", false,
- &use_bfc_allocator);
+ Status status = ReadBoolFromEnvVar(
+ "TF_CPU_ALLOCATOR_USE_BFC", alloc_visitors_defined, &use_bfc_allocator);
if (!status.ok()) {
LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
}
- VisitableAllocator* allocator;
+ Allocator* allocator = nullptr;
+ SubAllocator* sub_allocator =
+ (alloc_visitors_defined || use_bfc_allocator)
+ ? new BasicCPUAllocator(numa_enabled_ ? numa_node : -1,
+ cpu_alloc_visitors_, cpu_free_visitors_)
+ : nullptr;
if (use_bfc_allocator) {
// TODO(reedwm): evaluate whether 64GB by default is the best choice.
int64 cpu_mem_limit_in_mb = -1;
@@ -95,34 +103,63 @@ VisitableAllocator* ProcessState::GetCPUAllocator(int numa_node) {
LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
}
int64 cpu_mem_limit = cpu_mem_limit_in_mb * (1LL << 20);
- allocator = new BFCAllocator(
- new BasicCPUAllocator(numa_enabled_ ? numa_node : -1), cpu_mem_limit,
- true /*allow_growth*/, "bfc_cpu_allocator_for_gpu" /*name*/);
+ DCHECK(sub_allocator);
+ allocator =
+ new BFCAllocator(sub_allocator, cpu_mem_limit, true /*allow_growth*/,
+ "bfc_cpu_allocator_for_gpu" /*name*/);
VLOG(2) << "Using BFCAllocator with memory limit of "
<< cpu_mem_limit_in_mb << " MB for ProcessState CPU allocator";
- } else {
- allocator = new PoolAllocator(
- 100 /*pool_size_limit*/, true /*auto_resize*/,
- new BasicCPUAllocator(numa_enabled_ ? numa_node : -1),
- new NoopRounder, "cpu_pool");
+ } else if (alloc_visitors_defined) {
+ DCHECK(sub_allocator);
+ allocator =
+ new PoolAllocator(100 /*pool_size_limit*/, true /*auto_resize*/,
+ sub_allocator, new NoopRounder, "cpu_pool");
VLOG(2) << "Using PoolAllocator for ProcessState CPU allocator "
<< "numa_enabled_=" << numa_enabled_
<< " numa_node=" << numa_node;
+ } else {
+ DCHECK(!sub_allocator);
+ allocator = cpu_allocator();
}
- if (LogMemory::IsEnabled()) {
+ if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
// Wrap the allocator to track allocation ids for better logging
// at the cost of performance.
- allocator = new TrackingVisitableAllocator(allocator, true);
+ allocator = new TrackingAllocator(allocator, true);
}
cpu_allocators_.push_back(allocator);
+ if (!sub_allocator) {
+ DCHECK(cpu_alloc_visitors_.empty() && cpu_free_visitors_.empty());
+ }
}
return cpu_allocators_[numa_node];
}
+void ProcessState::AddCPUAllocVisitor(SubAllocator::Visitor visitor) {
+ VLOG(1) << "AddCPUAllocVisitor";
+ mutex_lock lock(mu_);
+ CHECK_EQ(0, cpu_allocators_.size()) // Crash OK
+ << "AddCPUAllocVisitor must be called prior to first call to "
+ "ProcessState::GetCPUAllocator";
+ cpu_alloc_visitors_.push_back(std::move(visitor));
+}
+
+void ProcessState::AddCPUFreeVisitor(SubAllocator::Visitor visitor) {
+ mutex_lock lock(mu_);
+ CHECK_EQ(0, cpu_allocators_.size()) // Crash OK
+ << "AddCPUFreeVisitor must be called prior to first call to "
+ "ProcessState::GetCPUAllocator";
+ cpu_free_visitors_.push_back(std::move(visitor));
+}
+
void ProcessState::TestOnlyReset() {
mutex_lock lock(mu_);
+ // Don't delete this value because it's static.
+ Allocator* default_cpu_allocator = cpu_allocator();
mem_desc_map_.clear();
- gtl::STLDeleteElements(&cpu_allocators_);
+ for (Allocator* a : cpu_allocators_) {
+ if (a != default_cpu_allocator) delete a;
+ }
+ cpu_allocators_.clear();
gtl::STLDeleteElements(&cpu_al_);
}
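A usage sketch for the new CPU hooks (ProcessState::singleton() assumed as the accessor; the RDMA callbacks are hypothetical). Note that defining any visitor also flips the default of TF_CPU_ALLOCATOR_USE_BFC to true, per the logic above:

    ProcessState* ps = ProcessState::singleton();
    ps->AddCPUAllocVisitor([](void* ptr, int numa_node, size_t num_bytes) {
      RegisterRdmaRegion(ptr, num_bytes);  // hypothetical NIC registration
    });
    ps->AddCPUFreeVisitor([](void* ptr, int numa_node, size_t num_bytes) {
      UnregisterRdmaRegion(ptr, num_bytes);  // hypothetical
    });
    // Must come after all visitor registration, or the CHECKs will fire.
    Allocator* cpu = ps->GetCPUAllocator(/*numa_node=*/0);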
diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h
index 2892677333..cac312d849 100644
--- a/tensorflow/core/common_runtime/process_state.h
+++ b/tensorflow/core/common_runtime/process_state.h
@@ -30,7 +30,6 @@ limitations under the License.
namespace tensorflow {
class Allocator;
-class VisitableAllocator;
class PoolAllocator;
// Singleton that manages per-process state, e.g. allocation of
@@ -65,7 +64,15 @@ class ProcessState {
// Returns the one CPUAllocator used for the given numa_node.
// TEMPORARY: ignores numa_node.
- VisitableAllocator* GetCPUAllocator(int numa_node);
+ Allocator* GetCPUAllocator(int numa_node);
+
+ // Registers alloc visitor for the CPU allocator(s).
+ // REQUIRES: must be called before GetCPUAllocator.
+ void AddCPUAllocVisitor(SubAllocator::Visitor v);
+
+ // Registers free visitor for the CPU allocator(s).
+ // REQUIRES: must be called before GetCPUAllocator.
+ void AddCPUFreeVisitor(SubAllocator::Visitor v);
typedef std::unordered_map<const void*, MemDesc> MDMap;
@@ -87,7 +94,9 @@ class ProcessState {
mutex mu_;
- std::vector<VisitableAllocator*> cpu_allocators_ GUARDED_BY(mu_);
+ std::vector<Allocator*> cpu_allocators_ GUARDED_BY(mu_);
+ std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ GUARDED_BY(mu_);
+ std::vector<SubAllocator::Visitor> cpu_free_visitors_ GUARDED_BY(mu_);
virtual ~ProcessState();
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
index 103eee03b3..9d59264899 100644
--- a/tensorflow/core/common_runtime/renamed_device.h
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -72,9 +72,10 @@ class RenamedDevice : public Device {
return underlying_->MakeGpuDevice();
}
- void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
- DeviceContext* dc, Allocator* allocator) override {
- underlying_->ReinitializeGpuDevice(context, device, dc, allocator);
+ Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
+ DeviceContext* dc,
+ Allocator* allocator) override {
+ return underlying_->ReinitializeGpuDevice(context, device, dc, allocator);
}
Status MakeTensorFromProto(const TensorProto& tensor_proto,
diff --git a/tensorflow/core/common_runtime/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h
deleted file mode 100644
index ae0563a96a..0000000000
--- a/tensorflow/core/common_runtime/visitable_allocator.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
-#define TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
-
-#include <functional>
-#include "tensorflow/core/framework/allocator.h"
-#include "tensorflow/core/framework/tracking_allocator.h"
-
-namespace tensorflow {
-
-// Subclass VisitableAllocator instead of Allocator when a memory
-// allocator needs to enable some kind of registration/deregistration
-// of memory areas.
-class VisitableAllocator : public Allocator {
- public:
- // Visitor gets called with a pointer to a memory area and its
- // size in bytes.
- typedef std::function<void(void*, size_t)> Visitor;
-
- // Register a visitor guaranteed to be called exactly once on each
- // chunk of memory newly allocated from the underlying device.
- // Typically, chunks will be reused and possibly sub-divided by a
- // pool manager, so the calls will happen only once per process
- // execution, not once per tensor (re)allocation.
- virtual void AddAllocVisitor(Visitor visitor) = 0;
-
- // Register a visitor guaranteed to be called on each chunk of
- // memory returned to the underlying device.
- virtual void AddFreeVisitor(Visitor visitor) = 0;
-};
-
-// Needed for cases when a VisitableAllocator gets wrapped for tracking.
-// Multiple-inheritance is considered acceptable in this case because
-// VisitableAllocator is a pure virtual interface and only TrackingAllocator
-// has default implementation.
-class TrackingVisitableAllocator : public TrackingAllocator,
- public VisitableAllocator {
- public:
- TrackingVisitableAllocator(VisitableAllocator* allocator, bool track_ids)
- : TrackingAllocator(allocator, track_ids), allocator_(allocator) {}
- ~TrackingVisitableAllocator() override {}
-
- string Name() override { return TrackingAllocator::Name(); }
-
- void* AllocateRaw(size_t alignment, size_t num_bytes) override {
- return TrackingAllocator::AllocateRaw(alignment, num_bytes);
- }
-
- void DeallocateRaw(void* ptr) override {
- TrackingAllocator::DeallocateRaw(ptr);
- }
-
- void AddAllocVisitor(Visitor visitor) override {
- allocator_->AddAllocVisitor(visitor);
- }
-
- void AddFreeVisitor(Visitor visitor) override {
- allocator_->AddFreeVisitor(visitor);
- }
-
- protected:
- VisitableAllocator* allocator_;
-};
-} // namespace tensorflow
-#endif // TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
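With VisitableAllocator deleted, callers migrate from per-allocator to process-level registration, and the Visitor signature gains an index argument. A before/after sketch of the migration:

    // Before (removed API):
    //   visitable_allocator->AddAllocVisitor(
    //       [](void* ptr, size_t num_bytes) { /* register ptr */ });
    // After (register before the allocator is first requested):
    ProcessState::singleton()->AddCPUAllocVisitor(
        [](void* ptr, int numa_node, size_t num_bytes) { /* register ptr */ });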
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 2a7ee16a16..84cee5569c 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -196,7 +196,7 @@ class CPUAllocatorFactory : public AllocatorFactory {
class CPUSubAllocator : public SubAllocator {
public:
explicit CPUSubAllocator(CPUAllocator* cpu_allocator)
- : cpu_allocator_(cpu_allocator) {}
+ : SubAllocator({}, {}), cpu_allocator_(cpu_allocator) {}
void* Alloc(size_t alignment, size_t num_bytes) override {
return cpu_allocator_->AllocateRaw(alignment, num_bytes);
@@ -222,4 +222,22 @@ Allocator* cpu_allocator() {
}
return cpu_alloc;
}
+
+SubAllocator::SubAllocator(const std::vector<Visitor>& alloc_visitors,
+ const std::vector<Visitor>& free_visitors)
+ : alloc_visitors_(alloc_visitors), free_visitors_(free_visitors) {}
+
+void SubAllocator::VisitAlloc(void* ptr, int index, size_t num_bytes) {
+ for (const auto& v : alloc_visitors_) {
+ v(ptr, index, num_bytes);
+ }
+}
+
+void SubAllocator::VisitFree(void* ptr, int index, size_t num_bytes) {
+  // Although we don't guarantee any order of visitor application, we strive
+  // to apply free visitors in reverse order of alloc visitors.
+ for (int i = free_visitors_.size() - 1; i >= 0; --i) {
+ free_visitors_[i](ptr, index, num_bytes);
+ }
+}
} // namespace tensorflow
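The reverse loop in VisitFree gives free visitors LIFO semantics relative to registration order. A small sketch, using BasicCPUAllocator from pool_allocator.h for concreteness:

    std::vector<string> log;
    SubAllocator::Visitor a1 = [&log](void*, int, size_t) { log.push_back("a1"); };
    SubAllocator::Visitor a2 = [&log](void*, int, size_t) { log.push_back("a2"); };
    SubAllocator::Visitor f1 = [&log](void*, int, size_t) { log.push_back("f1"); };
    SubAllocator::Visitor f2 = [&log](void*, int, size_t) { log.push_back("f2"); };
    BasicCPUAllocator sub(/*numa_node=*/0, {a1, a2}, {f1, f2});
    void* p = sub.Alloc(64, 8);  // log: a1, a2
    sub.Free(p, 8);              // log: a1, a2, f2, f1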
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index ded120b704..8c23604625 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/core/framework/resource_handle.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
@@ -387,13 +388,36 @@ void EnableCPUAllocatorStats(bool enable);
// full statistics. By default, it's disabled.
void EnableCPUAllocatorFullStats(bool enable);
-// Abstract interface of an object that does the underlying suballoc/free of
-// memory for a higher-level allocator.
+// An object that does the underlying suballoc/free of memory for a higher-level
+// allocator. The expectation is that the higher-level allocator is doing some
+// kind of cache or pool management so that it will call SubAllocator::Alloc and
+// Free relatively infrequently, compared to the number of times its own
+// AllocateRaw and DeallocateRaw methods are called.
class SubAllocator {
public:
+ // Visitor gets called with a pointer to a memory area and its
+ // size in bytes. The index value will be numa_node for a CPU
+ // allocator and GPU id for a GPU allocator.
+ typedef std::function<void(void*, int index, size_t)> Visitor;
+
+ SubAllocator(const std::vector<Visitor>& alloc_visitors,
+ const std::vector<Visitor>& free_visitors);
+
virtual ~SubAllocator() {}
virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
virtual void Free(void* ptr, size_t num_bytes) = 0;
+
+ protected:
+  // The implementation of Alloc() must call this on each newly allocated
+  // value.
+ void VisitAlloc(void* ptr, int index, size_t num_bytes);
+
+  // The implementation of Free() must call this on the value to be freed,
+  // immediately before deallocation.
+ void VisitFree(void* ptr, int index, size_t num_bytes);
+
+ const std::vector<Visitor> alloc_visitors_;
+ const std::vector<Visitor> free_visitors_;
};
} // namespace tensorflow
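A minimal conforming subclass, sketched to show where the two hooks must be invoked (malloc-backed, for illustration only; a real implementation would pick a meaningful index):

    class MallocSubAllocator : public SubAllocator {
     public:
      MallocSubAllocator(const std::vector<Visitor>& alloc_visitors,
                         const std::vector<Visitor>& free_visitors)
          : SubAllocator(alloc_visitors, free_visitors) {}

      void* Alloc(size_t alignment, size_t num_bytes) override {
        void* ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
        VisitAlloc(ptr, /*index=*/0, num_bytes);  // after allocation succeeds
        return ptr;
      }

      void Free(void* ptr, size_t num_bytes) override {
        VisitFree(ptr, /*index=*/0, num_bytes);  // before memory is released
        port::AlignedFree(ptr);
      }
    };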
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 794250a2c1..53ac639b4c 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -214,10 +214,12 @@ class DeviceBase {
// This is overridden by GPU devices to reinitialize the derived
// type returned by MakeGpuDevice.
- virtual void ReinitializeGpuDevice(OpKernelContext* /*context*/,
- PerOpGpuDevice* /*device*/,
- DeviceContext* /*dc*/,
- Allocator* /*allocator*/) {}
+ virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
+ PerOpGpuDevice* /*device*/,
+ DeviceContext* /*dc*/,
+ Allocator* /*allocator*/) {
+ return Status::OK();
+ }
// Unimplemented by default
virtual const DeviceAttributes& attributes() const;
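With the signature change above, reinitialization failures become reportable errors instead of silent no-ops. A sketch of a subclass override under the new contract (MyDevice is hypothetical):

    class MyDevice : public DeviceBase {
     public:
      explicit MyDevice(Env* env) : DeviceBase(env) {}
      Status ReinitializeGpuDevice(OpKernelContext* context,
                                   PerOpGpuDevice* device, DeviceContext* dc,
                                   Allocator* allocator) override {
        if (allocator == nullptr) {
          return errors::Internal("No allocator for per-op GPU device");
        }
        // Device-specific reinitialization would go here.
        return Status::OK();
      }
    };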
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 80f2b12987..3e34bf0418 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -265,9 +265,12 @@ OpKernelContext::OpKernelContext(Params* params, int num_outputs)
params_->ensure_eigen_gpu_device();
if (params_->eigen_gpu_device != nullptr) {
Allocator* eigen_gpu_allocator = get_allocator(AllocatorAttributes());
- params_->device->ReinitializeGpuDevice(this, params_->eigen_gpu_device,
- params_->op_device_context,
- eigen_gpu_allocator);
+ Status s = params_->device->ReinitializeGpuDevice(
+ this, params_->eigen_gpu_device, params_->op_device_context,
+ eigen_gpu_allocator);
+ if (!s.ok()) {
+ SetStatus(s);
+ }
}
if (params_->record_tensor_accesses) {
referenced_tensors_.Init();