diff options
32 files changed, 577 insertions, 628 deletions
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc index bb06f1c41c..726f74c7b7 100644 --- a/tensorflow/contrib/gdr/gdr_memory_manager.cc +++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc @@ -138,8 +138,6 @@ class GdrMemoryManager : public RemoteMemoryManager { Device* device, DeviceContext* device_context, bool on_host, StatusCallback done) override; - static void RegMemVisitors(); - protected: Status CreateEndpoint(const string& host, const string& port, RdmaEndpointPtr& endpoint); @@ -185,51 +183,35 @@ class GdrMemoryManager : public RemoteMemoryManager { TF_DISALLOW_COPY_AND_ASSIGN(GdrMemoryManager); }; +// TODO(byronyi): remove this class and its registration when the default +// cpu_allocator() returns visitable allocator, or cpu_allocator() is no +// longer in use. +class BFCGdrAllocator : public BFCAllocator { + public: + BFCGdrAllocator() + : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36, + true, "cpu_gdr_bfc") {} +}; +class BFCGdrAllocatorFactory : public AllocatorFactory { + public: + Allocator* CreateAllocator() override { return new BFCGdrAllocator; } + + virtual SubAllocator* CreateSubAllocator(int numa_node) { + return new BasicCPUAllocator(numa_node); + } +}; + +REGISTER_MEM_ALLOCATOR("BFCGdrAllocator", 102, BFCGdrAllocatorFactory); + GdrMemoryManager::GdrMemoryManager(const string& host, const string& port) : host_(host), port_(port), listening_(nullptr, EndpointDeleter), stopped_(true), - next_key_(0) { - static std::once_flag flag; - std::call_once(flag, []() { RegMemVisitors(); }); -} + next_key_(0) {} GdrMemoryManager::~GdrMemoryManager() { close(epfd_); } -/*static*/ void GdrMemoryManager::RegMemVisitors() { - SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node, - size_t num_bytes) { - GdrMemoryManager::Singleton().InsertMemoryRegion( - ptr, num_bytes, strings::StrCat("CPU:", numa_node)); - }; - SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node, - size_t num_bytes) { - GdrMemoryManager::Singleton().EvictMemoryRegion(ptr, num_bytes); - }; - ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor); - ProcessState::singleton()->AddCPUFreeVisitor(free_visitor); - -#if GOOGLE_CUDA - if (IsGDRAvailable()) { - int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1; - - // Note we don't free allocated GPU memory so there is no free visitor - SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id, - size_t num_bytes) { - RdmaMemoryMgr::Singleton().InsertMemoryRegion( - ptr, num_bytes, strings::StrCat("GPU:", gpu_id)); - }; - GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id, - cuda_alloc_visitor); - GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id, - alloc_visitor); - GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor); - LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id; - } -#endif // GOOGLE_CUDA -} - Status GdrMemoryManager::Init() { epfd_ = epoll_create1(0); if (epfd_ == -1) { @@ -289,6 +271,48 @@ Status GdrMemoryManager::Init() { "cannot add server to epoll"); } + Allocator* allocators[] = { +#if GOOGLE_CUDA + GPUProcessState::singleton()->GetCUDAHostAllocator(0), +#endif // GOOGLE_CUDA + ProcessState::singleton()->GetCPUAllocator(0), + cpu_allocator(), + }; + + using namespace std::placeholders; + VisitableAllocator::Visitor alloc_visitor = + std::bind(&GdrMemoryManager::InsertMemoryRegion, this, _1, _2); + VisitableAllocator::Visitor free_visitor = + std::bind(&GdrMemoryManager::EvictMemoryRegion, this, _1, _2); + + std::set<Allocator*> instrumented_; + + // Host memory allocators + for (Allocator* allocator : allocators) { + auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator); + CHECK(visitable_allocator) + << "is not visitable for instrumentation" << allocator->Name(); + // Make sure we don't instrument the same allocator twice + if (instrumented_.find(allocator) == std::end(instrumented_)) { + visitable_allocator->AddAllocVisitor(alloc_visitor); + visitable_allocator->AddFreeVisitor(free_visitor); + instrumented_.insert(allocator); + LOG(INFO) << "Instrumenting CPU allocator " << allocator->Name(); + } + } + +#if GOOGLE_CUDA + VisitableAllocator::Visitor cuda_alloc_visitor = + std::bind(&GdrMemoryManager::InsertMemoryRegion, this, _1, _2); + if (IsGDRAvailable()) { + // Note we don't free allocated GPU memory so there is no free visitor + int32_t bus_id = TryToReadNumaNode(listening_->verbs->device) + 1; + GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id, + cuda_alloc_visitor); + LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id; + } +#endif // GOOGLE_CUDA + return Status::OK(); } diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc index 2784bf124c..3cb5e61fac 100644 --- a/tensorflow/contrib/verbs/rdma_mgr.cc +++ b/tensorflow/contrib/verbs/rdma_mgr.cc @@ -20,6 +20,7 @@ limitations under the License. #include <vector> #include "tensorflow/contrib/verbs/grpc_verbs_client.h" #include "tensorflow/contrib/verbs/verbs_service.pb.h" +#include "tensorflow/core/common_runtime/bfc_allocator.h" #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" #include "tensorflow/core/common_runtime/gpu/gpu_util.h" #include "tensorflow/core/common_runtime/pool_allocator.h" @@ -28,7 +29,6 @@ limitations under the License. #include "tensorflow/core/distributed_runtime/session_mgr.h" #include "tensorflow/core/framework/allocator_registry.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/strings/strcat.h" namespace tensorflow { @@ -256,41 +256,74 @@ void MRDeleter(ibv_mr* mr) { } } +// TODO(byronyi): remove this class and its registration when the default +// cpu_allocator() returns visitable allocator, or cpu_allocator() is no +// longer in use. +class BFCRdmaAllocator : public BFCAllocator { + public: + BFCRdmaAllocator() + : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36, + true, "cpu_rdma_bfc") {} +}; +class BFCRdmaAllocatorFactory : public AllocatorFactory { + public: + Allocator* CreateAllocator() { return new BFCRdmaAllocator; } + + SubAllocator* CreateSubAllocator(int numa_node) { + return new BasicCPUAllocator(numa_node); + } +}; + +REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocatorFactory); + void RdmaMgr::InitAllocators() { - static std::once_flag flag; - std::call_once( - flag, [this]() { RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_; }); -} + RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_; -/*static*/ void RdmaMgr::RegMemVisitors() { - SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node, - size_t num_bytes) { - RdmaMemoryMgr::Singleton().InsertMemoryRegion( - ptr, num_bytes, strings::StrCat("CPU:", numa_node)); - }; - SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node, - size_t num_bytes) { - RdmaMemoryMgr::Singleton().EvictMemoryRegion(ptr, num_bytes); + Allocator* allocators[] = { +#if GOOGLE_CUDA + GPUProcessState::singleton()->GetCUDAHostAllocator(0), +#endif // GOOGLE_CUDA + ProcessState::singleton()->GetCPUAllocator(0), + cpu_allocator(), }; - ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor); - ProcessState::singleton()->AddCPUFreeVisitor(free_visitor); + using namespace std::placeholders; + + std::set<Allocator*> instrumented_; + + // Host memory allocators + for (Allocator* allocator : allocators) { + VisitableAllocator::Visitor alloc_visitor = + std::bind(&RdmaMemoryMgr::InsertMemoryRegion, + &RdmaMemoryMgr::Singleton(), _1, _2, allocator->Name()); + VisitableAllocator::Visitor free_visitor = std::bind( + &RdmaMemoryMgr::EvictMemoryRegion, &RdmaMemoryMgr::Singleton(), _1, _2); + + auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator); + CHECK(visitable_allocator) + << "is not visitable for instrumentation" << allocator->Name(); + // Make sure we don't instrument the same allocator twice + if (instrumented_.find(allocator) == std::end(instrumented_)) { + visitable_allocator->AddAllocVisitor(alloc_visitor); + visitable_allocator->AddFreeVisitor(free_visitor); + instrumented_.insert(allocator); + LOG(INFO) << "Instrumenting CPU allocator " << allocator->Name(); + } + } #if GOOGLE_CUDA if (IsGDRAvailable()) { // Note we don't free allocated GPU memory so there is no free visitor int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1; - SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id, - size_t num_bytes) { - RdmaMemoryMgr::Singleton().InsertMemoryRegion( - ptr, num_bytes, strings::StrCat("GPU:", gpu_id)); - }; + char buf[8]; + sprintf(buf, "gpu"); + VisitableAllocator::Visitor cuda_alloc_visitor = + std::bind(&RdmaMemoryMgr::InsertMemoryRegion, + &RdmaMemoryMgr::Singleton(), _1, _2, std::string(buf)); + GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id, cuda_alloc_visitor); - GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id, - alloc_visitor); - GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor); LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id; } #endif // GOOGLE_CUDA diff --git a/tensorflow/contrib/verbs/rdma_mgr.h b/tensorflow/contrib/verbs/rdma_mgr.h index 74b92cc9a6..9fffc335bb 100644 --- a/tensorflow/contrib/verbs/rdma_mgr.h +++ b/tensorflow/contrib/verbs/rdma_mgr.h @@ -39,7 +39,6 @@ class RdmaMgr { void SetupChannels(); bool ConnectivityCheck(); void InitAllocators(); - static void RegMemVisitors(); const string& local_worker() { return local_worker_; } private: diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc index 61469686e4..1a0b5028fe 100644 --- a/tensorflow/contrib/verbs/verbs_server_lib.cc +++ b/tensorflow/contrib/verbs/verbs_server_lib.cc @@ -76,13 +76,8 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def, return Status::OK(); } -namespace { -std::once_call reg_mem_visitors_call; -} // namespace - Status VerbsServer::Init(ServiceInitFunction service_func, RendezvousMgrCreationFunction rendezvous_mgr_func) { - std::call_once(reg_mem_visitors_call, []() { RdmaMgr::RegMemVisitors(); }); Status s = GrpcServer::Init(service_func, rendezvous_mgr_func); { mutex_lock l(mu_); diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 9bcf5b0865..d55bd8d7ed 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2783,6 +2783,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [ "common_runtime/step_stats_collector.h", "common_runtime/threadpool_device.h", "common_runtime/tracing_device.h", + "common_runtime/visitable_allocator.h", "common_runtime/process_state.h", "common_runtime/pool_allocator.h", "graph/gradients.h", diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 3843ea9e60..84c6285bbe 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -31,7 +31,7 @@ namespace tensorflow { BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory, bool allow_growth, const string& name) - : sub_allocator_(sub_allocator), + : suballocator_(sub_allocator), name_(name), free_chunks_list_(kInvalidChunkHandle), next_allocation_id_(1) { @@ -72,7 +72,7 @@ BFCAllocator::~BFCAllocator() { VLOG(2) << "Number of regions allocated: " << region_manager_.regions().size(); for (const auto& region : region_manager_.regions()) { - sub_allocator_->Free(region.ptr(), region.memory_size()); + suballocator_->Free(region.ptr(), region.memory_size()); } for (BinNum b = 0; b < kNumBins; b++) { @@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { // Try allocating. size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes); - void* mem_addr = sub_allocator_->Alloc(alignment, bytes); + void* mem_addr = suballocator_->Alloc(alignment, bytes); if (mem_addr == nullptr && !started_backpedal_) { // Only backpedal once. started_backpedal_ = true; @@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { while (mem_addr == nullptr) { bytes = RoundedBytes(bytes * kBackpedalFactor); if (bytes < rounded_bytes) break; - mem_addr = sub_allocator_->Alloc(alignment, bytes); + mem_addr = suballocator_->Alloc(alignment, bytes); } } @@ -158,6 +158,10 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { // Insert the chunk into the right bin. InsertFreeChunkIntoBin(h); + // Invoke visitors on newly allocated region. + for (const auto& visitor : region_visitors_) { + visitor(mem_addr, bytes); + } return true; } @@ -486,6 +490,15 @@ void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) { InsertFreeChunkIntoBin(coalesced_chunk); } +void BFCAllocator::AddAllocVisitor(Visitor visitor) { + VLOG(1) << "AddVisitor"; + mutex_lock l(lock_); + region_visitors_.push_back(visitor); + for (const auto& region : region_manager_.regions()) { + visitor(region.ptr(), region.memory_size()); + } +} + bool BFCAllocator::TracksAllocationSizes() { return true; } size_t BFCAllocator::RequestedSize(const void* ptr) { diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index 364071e066..20e1dab1d5 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -23,7 +23,7 @@ limitations under the License. #include <vector> #include "tensorflow/core/common_runtime/allocator_retry.h" -#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/common_runtime/visitable_allocator.h" #include "tensorflow/core/lib/gtl/stl_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/macros.h" @@ -42,7 +42,7 @@ namespace tensorflow { // coalescing. One assumption we make is that the process using this // allocator owns pretty much all of the memory, and that nearly // all requests to allocate memory go through this interface. -class BFCAllocator : public Allocator { +class BFCAllocator : public VisitableAllocator { public: // Takes ownership of sub_allocator. BFCAllocator(SubAllocator* sub_allocator, size_t total_memory, @@ -55,6 +55,11 @@ class BFCAllocator : public Allocator { const AllocationAttributes& allocation_attr) override; void DeallocateRaw(void* ptr) override; + void AddAllocVisitor(Visitor visitor) override; + + // Does nothing, because memory is never freed. + void AddFreeVisitor(Visitor visitor) override {} + bool TracksAllocationSizes() override; size_t RequestedSize(const void* ptr) override; @@ -418,7 +423,7 @@ class BFCAllocator : public Allocator { // of the available memory. bool started_backpedal_ = false; - std::unique_ptr<SubAllocator> sub_allocator_; + std::unique_ptr<SubAllocator> suballocator_; string name_; // Structures mutable after construction @@ -430,6 +435,9 @@ class BFCAllocator : public Allocator { // Pointer to head of linked list of free Chunks ChunkHandle free_chunks_list_ GUARDED_BY(lock_); + // Called once on each region, ASAP. + std::vector<Visitor> region_visitors_ GUARDED_BY(lock_); + // Counter containing the next unique identifier to assign to a // newly-created chunk. int64 next_allocation_id_ GUARDED_BY(lock_); diff --git a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h b/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h index 6bd29ef775..636cd43575 100644 --- a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h +++ b/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h @@ -26,12 +26,8 @@ namespace tensorflow { class CUDAHostAllocator : public SubAllocator { public: // Note: stream_exec cannot be null. - explicit CUDAHostAllocator(se::StreamExecutor* stream_exec, int numa_node, - const std::vector<Visitor>& alloc_visitors, - const std::vector<Visitor>& free_visitors) - : SubAllocator(alloc_visitors, free_visitors), - stream_exec_(stream_exec), - numa_node_(numa_node) { + explicit CUDAHostAllocator(se::StreamExecutor* stream_exec) + : stream_exec_(stream_exec) { CHECK(stream_exec_ != nullptr); } ~CUDAHostAllocator() override {} @@ -43,23 +39,19 @@ class CUDAHostAllocator : public SubAllocator { if (ptr == nullptr) { LOG(WARNING) << "could not allocate pinned host memory of size: " << num_bytes; - return ptr; } - VisitAlloc(ptr, numa_node_, num_bytes); } return ptr; } void Free(void* ptr, size_t num_bytes) override { if (ptr != nullptr) { - VisitFree(ptr, numa_node_, num_bytes); stream_exec_->HostMemoryDeallocate(ptr); } } private: se::StreamExecutor* stream_exec_; // not owned, non-null - const int numa_node_; TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator); }; diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc index 44ffce77a1..2d4c8d0201 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc @@ -22,15 +22,18 @@ limitations under the License. namespace tensorflow { -GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator, - size_t total_memory, const string& name) - : GPUBFCAllocator(sub_allocator, total_memory, GPUOptions(), name) {} +GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory, + const string& name) + : GPUBFCAllocator(cuda_gpu_id, total_memory, GPUOptions(), name) {} -GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator, - size_t total_memory, +GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory, const GPUOptions& gpu_options, const string& name) - : BFCAllocator(sub_allocator, total_memory, gpu_options.allow_growth(), - name) {} + : BFCAllocator( + new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), + gpu_options.per_process_gpu_memory_fraction() > 1.0 || + gpu_options.experimental().use_unified_memory()), + total_memory, gpu_options.allow_growth(), name) {} } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h index 6b6de80734..f1cc2eace1 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h @@ -31,20 +31,28 @@ limitations under the License. namespace tensorflow { -// Suballocator for GPU memory. -class GPUMemAllocator : public SubAllocator { +// A GPU memory allocator that implements a 'best-fit with coalescing' +// algorithm. +class GPUBFCAllocator : public BFCAllocator { public: // 'cuda_gpu_id' refers to the ID of the GPU device within // the process and must reference a valid ID in the process. + GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory, + const string& name); + GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory, + const GPUOptions& gpu_options, const string& name); + virtual ~GPUBFCAllocator() {} + + TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator); +}; + +// Suballocator for GPU memory. +class GPUMemAllocator : public SubAllocator { + public: // Note: stream_exec cannot be null. - explicit GPUMemAllocator(se::StreamExecutor* stream_exec, CudaGpuId gpu_id, - bool use_unified_memory, - const std::vector<Visitor>& alloc_visitors, - const std::vector<Visitor>& free_visitors) - : SubAllocator(alloc_visitors, free_visitors), - stream_exec_(stream_exec), - gpu_id_(gpu_id), - use_unified_memory_(use_unified_memory) { + explicit GPUMemAllocator(se::StreamExecutor* stream_exec, + bool use_unified_memory) + : stream_exec_(stream_exec), use_unified_memory_(use_unified_memory) { CHECK(stream_exec_ != nullptr); } ~GPUMemAllocator() override {} @@ -57,14 +65,12 @@ class GPUMemAllocator : public SubAllocator { } else { ptr = stream_exec_->AllocateArray<char>(num_bytes).opaque(); } - VisitAlloc(ptr, gpu_id_.value(), num_bytes); } return ptr; } void Free(void* ptr, size_t num_bytes) override { if (ptr != nullptr) { - VisitFree(ptr, gpu_id_.value(), num_bytes); if (use_unified_memory_) { stream_exec_->UnifiedMemoryDeallocate(ptr); } else { @@ -76,25 +82,11 @@ class GPUMemAllocator : public SubAllocator { private: se::StreamExecutor* stream_exec_; // not owned, non-null - const CudaGpuId gpu_id_; const bool use_unified_memory_ = false; TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator); }; -// A GPU memory allocator that implements a 'best-fit with coalescing' -// algorithm. -class GPUBFCAllocator : public BFCAllocator { - public: - GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory, - const string& name); - GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory, - const GPUOptions& gpu_options, const string& name); - ~GPUBFCAllocator() override {} - - TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator); -}; - } // namespace tensorflow #endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc index 7112c3afd4..67caeb3495 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -21,7 +21,6 @@ limitations under the License. #include <vector> #include "tensorflow/core/common_runtime/gpu/gpu_id.h" -#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h" #include "tensorflow/core/common_runtime/gpu/gpu_init.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" @@ -47,11 +46,7 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use, } TEST(GPUBFCAllocatorTest, NoDups) { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); CheckStats(&a, 0, 0, 0, 0); // Allocate a lot of raw pointers @@ -80,11 +75,7 @@ TEST(GPUBFCAllocatorTest, NoDups) { } TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); // Allocate 256 raw pointers of sizes between 100 bytes and about // a meg random::PhiloxRandom philox(123, 17); @@ -142,11 +133,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) { } TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); CheckStats(&a, 0, 0, 0, 0); float* first_ptr = a.Allocate<float>(1024); @@ -181,30 +168,18 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { } TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); float* ptr = a.Allocate<float>(0); EXPECT_EQ(nullptr, ptr); } TEST(GPUBFCAllocatorTest, TracksSizes) { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); EXPECT_EQ(true, a.TracksAllocationSizes()); } TEST(GPUBFCAllocatorTest, AllocatedVsRequested) { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); float* t1 = a.Allocate<float>(1); EXPECT_EQ(4, a.RequestedSize(t1)); EXPECT_EQ(256, a.AllocatedSize(t1)); @@ -212,12 +187,8 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) { } TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); // Configure a 1MiB byte limit - GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc"); float* first_ptr = a.Allocate<float>(1 << 6); float* second_ptr = a.Allocate<float>(1 << 20); @@ -232,11 +203,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) { options.set_allow_growth(true); // Max of 2GiB, but starts out small. - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1LL << 31, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc"); // Allocate 10 raw pointers of sizes between 100 bytes and about // 64 megs. @@ -297,15 +264,8 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) { } TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1UL << 60, "GPU_0_bfc"); - sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator b(sub_allocator, 1UL << 60, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc"); + GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc"); void* amem = a.AllocateRaw(1, 1); void* bmem = b.AllocateRaw(1, 1 << 30); a.DeallocateRaw(amem); @@ -313,11 +273,7 @@ TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) { } static void BM_Allocation(int iters) { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc"); // Exercise a few different allocation sizes std::vector<size_t> sizes = {256, 4096, 16384, 524288, 512, 1048576, 10485760, 104857600, @@ -333,11 +289,7 @@ static void BM_Allocation(int iters) { BENCHMARK(BM_Allocation); static void BM_AllocationThreaded(int iters, int num_threads) { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc"); thread::ThreadPool pool(Env::Default(), "test", num_threads); std::atomic_int_fast32_t count(iters); mutex done_lock; @@ -373,11 +325,7 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16); // A more complex benchmark that defers deallocation of an object for // "delay" allocations. static void BM_AllocationDelayed(int iters, int delay) { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); // Exercise a few different allocation sizes std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024}; int size_index = 0; @@ -415,11 +363,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test { // only methods inside this class can access private members of BFCAllocator. void TestBinDebugInfo() { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); std::vector<void*> initial_ptrs; std::vector<size_t> initial_ptrs_allocated_sizes; @@ -497,11 +441,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test { } void TestLog2FloorNonZeroSlow() { - CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUBFCAllocator a(sub_allocator, 1 /* total_memory */, "GPU_0_bfc"); + GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc"); EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0)); EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1)); EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2)); diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc index 8e14f1ea75..934a57a5fb 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc @@ -27,7 +27,7 @@ limitations under the License. namespace tensorflow { -GPUcudaMallocAllocator::GPUcudaMallocAllocator(Allocator* allocator, +GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator, CudaGpuId cuda_gpu_id) : base_allocator_(allocator) { stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); @@ -60,6 +60,14 @@ void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) { #endif // GOOGLE_CUDA } +void GPUcudaMallocAllocator::AddAllocVisitor(Visitor visitor) { + return base_allocator_->AddAllocVisitor(visitor); +} + +void GPUcudaMallocAllocator::AddFreeVisitor(Visitor visitor) { + return base_allocator_->AddFreeVisitor(visitor); +} + bool GPUcudaMallocAllocator::TracksAllocationSizes() { return false; } } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h index 3d1d0ef481..856fdc34b4 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h @@ -19,7 +19,7 @@ limitations under the License. #include <memory> #include "tensorflow/core/common_runtime/gpu/gpu_id.h" -#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/common_runtime/visitable_allocator.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/types.h" @@ -29,17 +29,20 @@ namespace tensorflow { // An allocator that wraps a GPU allocator and adds debugging // functionality that verifies that users do not write outside their // allocated memory. -class GPUcudaMallocAllocator : public Allocator { +class GPUcudaMallocAllocator : public VisitableAllocator { public: - explicit GPUcudaMallocAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id); + explicit GPUcudaMallocAllocator(VisitableAllocator* allocator, + CudaGpuId cuda_gpu_id); ~GPUcudaMallocAllocator() override; string Name() override { return "gpu_debug"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override; void DeallocateRaw(void* ptr) override; + void AddAllocVisitor(Visitor visitor) override; + void AddFreeVisitor(Visitor visitor) override; bool TracksAllocationSizes() override; private: - Allocator* base_allocator_ = nullptr; // owned + VisitableAllocator* base_allocator_ = nullptr; // owned se::StreamExecutor* stream_exec_; // Not owned. diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc index 6bad66dcec..e4c834b30d 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc @@ -73,7 +73,7 @@ void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) { // ----------------------------------------------------------------------------- // GPUDebugAllocator // ----------------------------------------------------------------------------- -GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator, +GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator, CudaGpuId cuda_gpu_id) : base_allocator_(allocator) { stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); @@ -111,6 +111,14 @@ void GPUDebugAllocator::DeallocateRaw(void* ptr) { base_allocator_->DeallocateRaw(ptr); } +void GPUDebugAllocator::AddAllocVisitor(Visitor visitor) { + return base_allocator_->AddAllocVisitor(visitor); +} + +void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) { + return base_allocator_->AddFreeVisitor(visitor); +} + bool GPUDebugAllocator::TracksAllocationSizes() { return true; } size_t GPUDebugAllocator::RequestedSize(const void* ptr) { @@ -150,7 +158,7 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) { // ----------------------------------------------------------------------------- // GPUNanResetAllocator // ----------------------------------------------------------------------------- -GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator, +GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator, CudaGpuId cuda_gpu_id) : base_allocator_(allocator) { stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); @@ -192,6 +200,14 @@ void GPUNanResetAllocator::DeallocateRaw(void* ptr) { base_allocator_->DeallocateRaw(ptr); } +void GPUNanResetAllocator::AddAllocVisitor(Visitor visitor) { + return base_allocator_->AddAllocVisitor(visitor); +} + +void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) { + return base_allocator_->AddFreeVisitor(visitor); +} + size_t GPUNanResetAllocator::RequestedSize(const void* ptr) { return base_allocator_->RequestedSize(ptr); } diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h index 0f27ff4384..0f9b72040c 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h @@ -21,7 +21,7 @@ limitations under the License. #include <unordered_map> #include "tensorflow/core/common_runtime/gpu/gpu_id.h" -#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/common_runtime/visitable_allocator.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/types.h" @@ -31,13 +31,16 @@ namespace tensorflow { // An allocator that wraps a GPU allocator and adds debugging // functionality that verifies that users do not write outside their // allocated memory. -class GPUDebugAllocator : public Allocator { +class GPUDebugAllocator : public VisitableAllocator { public: - explicit GPUDebugAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id); + explicit GPUDebugAllocator(VisitableAllocator* allocator, + CudaGpuId cuda_gpu_id); ~GPUDebugAllocator() override; string Name() override { return "gpu_debug"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override; void DeallocateRaw(void* ptr) override; + void AddAllocVisitor(Visitor visitor) override; + void AddFreeVisitor(Visitor visitor) override; bool TracksAllocationSizes() override; size_t RequestedSize(const void* ptr) override; size_t AllocatedSize(const void* ptr) override; @@ -50,7 +53,7 @@ class GPUDebugAllocator : public Allocator { bool CheckFooter(void* ptr); private: - Allocator* base_allocator_ = nullptr; // owned + VisitableAllocator* base_allocator_ = nullptr; // owned se::StreamExecutor* stream_exec_; // Not owned. @@ -60,20 +63,23 @@ class GPUDebugAllocator : public Allocator { // An allocator that wraps a GPU allocator and resets the memory on // allocation and free to 'NaN', helping to identify cases where the // user forgets to initialize the memory. -class GPUNanResetAllocator : public Allocator { +class GPUNanResetAllocator : public VisitableAllocator { public: - explicit GPUNanResetAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id); + explicit GPUNanResetAllocator(VisitableAllocator* allocator, + CudaGpuId cuda_gpu_id); ~GPUNanResetAllocator() override; string Name() override { return "gpu_nan_reset"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override; void DeallocateRaw(void* ptr) override; + void AddAllocVisitor(Visitor visitor) override; + void AddFreeVisitor(Visitor visitor) override; size_t RequestedSize(const void* ptr) override; size_t AllocatedSize(const void* ptr) override; void GetStats(AllocatorStats* stats) override; void ClearStats() override; private: - Allocator* base_allocator_ = nullptr; // owned + VisitableAllocator* base_allocator_ = nullptr; // owned se::StreamExecutor* stream_exec_; // Not owned. diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc index 98283cd846..236a0afa0b 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc @@ -35,10 +35,7 @@ namespace { TEST(GPUDebugAllocatorTest, OverwriteDetection_None) { const CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), + GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), cuda_gpu_id); auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); @@ -62,10 +59,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) { EXPECT_DEATH( { const CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), - cuda_gpu_id, false /*use_unified_memory*/, {}, {}); - GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), + GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), cuda_gpu_id); auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); @@ -98,10 +92,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) { EXPECT_DEATH( { const CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), - cuda_gpu_id, false /*use_unified_memory*/, {}, {}); - GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), + GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), cuda_gpu_id); auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); @@ -131,10 +122,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) { TEST(GPUDebugAllocatorTest, ResetToNan) { const CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUNanResetAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), + GPUNanResetAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), cuda_gpu_id); auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); @@ -175,11 +163,8 @@ TEST(GPUDebugAllocatorTest, ResetToNan) { TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) { const CudaGpuId cuda_gpu_id(0); // NaN reset must be the outer-most allocator. - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); GPUNanResetAllocator a( - new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), + new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), cuda_gpu_id), cuda_gpu_id); auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); @@ -220,21 +205,15 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) { TEST(GPUDebugAllocatorTest, TracksSizes) { const CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); - GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), + GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), cuda_gpu_id); EXPECT_EQ(true, a.TracksAllocationSizes()); } TEST(GPUDebugAllocatorTest, AllocatedVsRequested) { const CudaGpuId cuda_gpu_id(0); - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); GPUNanResetAllocator a( - new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), + new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), cuda_gpu_id), cuda_gpu_id); float* t1 = a.Allocate<float>(1); diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 50e61b7e00..2763ac0d4a 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -41,6 +41,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/gpu/gpu_util.h" #include "tensorflow/core/common_runtime/gpu_device_context.h" #include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/common_runtime/visitable_allocator.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/op_kernel.h" @@ -284,38 +285,6 @@ BaseGPUDevice::~BaseGPUDevice() { for (auto ctx : device_contexts_) ctx->Unref(); } -// This should be idempotent if already initialized. -Status BaseGPUDevice::InitScratchBuffers() { - mutex_lock l(scratch_init_mutex_); - if (scratch_.size() < max_streams_) { - for (int i = 0; i < max_streams_; i++) { - DCHECK(streams_[i]); - if (scratch_.size() > i && scratch_[i]) continue; - size_t scratch_buffer_size = - Eigen::kCudaScratchSize + sizeof(unsigned int); - void* scratch_buffer = gpu_allocator_->AllocateRaw( - Allocator::kAllocatorAlignment, scratch_buffer_size); - if (scratch_buffer == nullptr) { - return errors::FailedPrecondition( - "Failed to allocate scratch buffer for device ", - tf_gpu_id_.value()); - } - se::DeviceMemory<char> mem( - se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size)); - - bool ok = executor_->SynchronousMemZero( - &mem, Eigen::kCudaScratchSize + sizeof(unsigned int)); - if (!ok) { - return errors::FailedPrecondition( - "Failed to memcopy into scratch buffer for device ", - tf_gpu_id_.value()); - } - scratch_.push_back(static_cast<char*>(scratch_buffer)); - } - } - return Status::OK(); -} - Status BaseGPUDevice::Init(const SessionOptions& options) { auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_); if (!executor_status.status().ok()) { @@ -334,6 +303,27 @@ Status BaseGPUDevice::Init(const SessionOptions& options) { for (int i = 0; i < max_streams_; i++) { streams_.push_back(StreamGroupFactory::Global().GetOrCreate( tf_gpu_id_, i, executor_, options.config.gpu_options())); + + size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int); + void* scratch_buffer = gpu_allocator_->AllocateRaw( + Allocator::kAllocatorAlignment, scratch_buffer_size); + if (scratch_buffer == nullptr) { + return errors::FailedPrecondition( + "Failed to allocate scratch buffer for device ", tf_gpu_id_.value()); + } + scratch_.push_back(static_cast<char*>(scratch_buffer)); + + se::DeviceMemory<char> mem( + se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size)); + + bool ok = executor_->SynchronousMemZero( + &mem, Eigen::kCudaScratchSize + sizeof(unsigned int)); + if (!ok) { + return errors::FailedPrecondition( + "Failed to memcopy into scratch buffer for device ", + tf_gpu_id_.value()); + } + device_contexts_.push_back(new GPUDeviceContext( i, streams_.back()->compute, streams_.back()->host_to_device, streams_.back()->device_to_host, streams_.back()->device_to_device)); @@ -877,11 +867,10 @@ PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() { return new ConcretePerOpGpuDevice(); } -Status BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context, - PerOpGpuDevice* device, - DeviceContext* dc, - Allocator* allocator) { - TF_RETURN_IF_ERROR(InitScratchBuffers()); +void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context, + PerOpGpuDevice* device, + DeviceContext* dc, + Allocator* allocator) { if (dc) { const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc); const int stream_id = gpu_dc->stream_id(); @@ -892,7 +881,6 @@ Status BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context, } else { ReinitializeDevice(context, device, 0, allocator); } - return Status::OK(); } Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr, diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index b3eea55758..56d03d7a8c 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -86,9 +86,8 @@ class BaseGPUDevice : public LocalDevice { // The caller owns the returned device. PerOpGpuDevice* MakeGpuDevice() override; - Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device, - DeviceContext* dc, - Allocator* allocator) override; + void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device, + DeviceContext* dc, Allocator* allocator) override; // Returns the CUDA GPU id of this device within the native driver system; // e.g., for CUDA this is the ordinal of the GPU within the system. @@ -126,7 +125,6 @@ class BaseGPUDevice : public LocalDevice { class StreamGroupFactory; gtl::InlinedVector<StreamGroup*, 4> streams_; - mutex scratch_init_mutex_; gtl::InlinedVector<char*, 4> scratch_; std::vector<GPUDeviceContext*> device_contexts_; GpuDeviceInfo* gpu_device_info_ = nullptr; @@ -137,9 +135,6 @@ class BaseGPUDevice : public LocalDevice { std::unique_ptr<EventMgr> em_; std::unique_ptr<thread::ThreadPool> thread_pool_; - // Initialize scractch buffers used by Eigen. - Status InitScratchBuffers(); - void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device, int stream_id, Allocator* allocator); diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc index 9ec740fabe..b18688174d 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc @@ -76,16 +76,12 @@ GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) { // This function is defined for debugging problems with the allocators. GPUProcessState::~GPUProcessState() { CHECK_EQ(this, instance_); + for (auto p : gpu_allocators_) { + delete p; + } instance_ = nullptr; } -int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) { - // Return the NUMA node associated with the GPU's StreamExecutor. - se::StreamExecutor* se = - GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie(); - return se->GetDeviceDescription().numa_node(); -} - Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options, TfGpuId tf_gpu_id, size_t total_bytes) { @@ -97,10 +93,13 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options, if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) { gpu_allocators_.resize(tf_gpu_id.value() + 1); + if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) + gpu_al_.resize(tf_gpu_id.value() + 1); } - AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()]; - if (allocator_parts.allocator.get() == nullptr) { + if (gpu_allocators_[tf_gpu_id.value()] == nullptr) { + VisitableAllocator* gpu_allocator; + // Validate allocator types. if (!allocator_type.empty() && allocator_type != "BFC") { LOG(ERROR) << "Invalid allocator type: " << allocator_type; @@ -109,17 +108,8 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options, CudaGpuId cuda_gpu_id; TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id)); - int bus_id = BusIdForGPU(tf_gpu_id); - while (bus_id >= gpu_visitors_.size()) { - gpu_visitors_.push_back({}); - } - GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - (options.per_process_gpu_memory_fraction() > 1.0 || - options.experimental().use_unified_memory()), - gpu_visitors_[bus_id], {}); - Allocator* gpu_allocator = - new GPUBFCAllocator(sub_allocator, total_bytes, options, + gpu_allocator = + new GPUBFCAllocator(cuda_gpu_id, total_bytes, options, strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc")); // If true, checks for memory overwrites by writing @@ -133,25 +123,34 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options, // **WARNING** probably will not work in a multi-gpu scenario gpu_allocator = new GPUcudaMallocAllocator(gpu_allocator, cuda_gpu_id); } - - Allocator* recording_allocator = nullptr; + gpu_allocators_[tf_gpu_id.value()] = gpu_allocator; + + // If there are any pending AllocVisitors for this bus, add + // them now. + se::StreamExecutor* se = + GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie(); + int bus_id = se->GetDeviceDescription().numa_node(); + if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) { + for (const auto& v : gpu_visitors_[bus_id]) { + gpu_allocator->AddAllocVisitor(v); + } + } if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) { ProcessState::MemDesc md; md.loc = ProcessState::MemDesc::GPU; md.dev_index = cuda_gpu_id.value(); md.gpu_registered = false; md.nic_registered = true; - recording_allocator = new internal::RecordingAllocator( + if (static_cast<int64>(gpu_al_.size()) <= tf_gpu_id.value()) { + gpu_al_.resize(tf_gpu_id.value() + 1); + } + gpu_al_[tf_gpu_id.value()] = new internal::RecordingAllocator( &process_state_->mem_desc_map_, gpu_allocator, md, &mu_); } - allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator), sub_allocator, - std::unique_ptr<Allocator>(recording_allocator)}; - } - if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) { - return allocator_parts.recording_allocator.get(); - } else { - return allocator_parts.allocator.get(); } + if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) + return gpu_al_[tf_gpu_id.value()]; + return gpu_allocators_[tf_gpu_id.value()]; #else LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda."; return nullptr; @@ -173,12 +172,11 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) { tf_shared_lock lock(mu_); if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types && - !cuda_host_allocators_.empty() && - cuda_host_allocators_[0].recording_allocator != nullptr) { - return cuda_host_allocators_[0].recording_allocator.get(); + static_cast<int>(cuda_al_.size()) > 0) { + return cuda_al_[0]; } if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) { - return cuda_host_allocators_[0].allocator.get(); + return cuda_host_allocators_[0]; } } @@ -192,7 +190,7 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) { // it knows is valid. se::StreamExecutor* se = nullptr; for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) { - if (gpu_allocators_[i].allocator != nullptr) { + if (gpu_allocators_[i] != nullptr) { se = GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie(); break; } @@ -201,15 +199,6 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) { CHECK_NE(nullptr, se); while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) { - while (cuda_host_alloc_visitors_.size() <= numa_node) { - cuda_host_alloc_visitors_.push_back({}); - } - while (cuda_host_free_visitors_.size() <= numa_node) { - cuda_host_free_visitors_.push_back({}); - } - SubAllocator* sub_allocator = new CUDAHostAllocator( - se, numa_node, cuda_host_alloc_visitors_[numa_node], - cuda_host_free_visitors_[numa_node]); // TODO(zheng-xq): evaluate whether 64GB by default is the best choice. int64 cuda_host_mem_limit_in_mb = -1; Status status = ReadInt64FromEnvVar("TF_CUDA_HOST_MEM_LIMIT_IN_MB", @@ -219,92 +208,62 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) { LOG(ERROR) << "GetCUDAHostAllocator: " << status.error_message(); } int64 cuda_host_mem_limit = cuda_host_mem_limit_in_mb * (1LL << 20); - Allocator* allocator = - new BFCAllocator(sub_allocator, cuda_host_mem_limit, + VisitableAllocator* allocator = + new BFCAllocator(new CUDAHostAllocator(se), cuda_host_mem_limit, true /*allow_growth*/, "cuda_host_bfc" /*name*/); - if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) { + if (LogMemory::IsEnabled()) { // Wrap the allocator to track allocation ids for better logging // at the cost of performance. - allocator = new TrackingAllocator(allocator, true); + allocator = new TrackingVisitableAllocator(allocator, true); } - cuda_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator), - sub_allocator, - std::unique_ptr<Allocator>(nullptr)}); - AllocatorParts& allocator_parts = cuda_host_allocators_.back(); + cuda_host_allocators_.push_back(allocator); if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) { ProcessState::MemDesc md; md.loc = ProcessState::MemDesc::CPU; md.dev_index = 0; md.gpu_registered = true; md.nic_registered = false; - allocator_parts.recording_allocator.reset( - new internal::RecordingAllocator(&process_state_->mem_desc_map_, - allocator_parts.allocator.get(), md, - &mu_)); + cuda_al_.push_back(new internal::RecordingAllocator( + &process_state_->mem_desc_map_, cuda_host_allocators_.back(), md, + &mu_)); } } - if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) { - return cuda_host_allocators_[0].recording_allocator.get(); - } else { - return cuda_host_allocators_[0].allocator.get(); - } + if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) + return cuda_al_[0]; + return cuda_host_allocators_[0]; } void GPUProcessState::AddGPUAllocVisitor(int bus_id, - const SubAllocator::Visitor& visitor) { + const AllocVisitor& visitor) { + CHECK(process_state_); #if GOOGLE_CUDA mutex_lock lock(mu_); - CHECK(gpu_allocators_.empty()) // Crash OK - << "AddGPUAllocVisitor must be called before " - "first call to GetGPUAllocator."; + for (int i = 0; i < static_cast<int64>(gpu_allocators_.size()); ++i) { + se::StreamExecutor* se = + GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie(); + if (gpu_allocators_[i] && + (se->GetDeviceDescription().numa_node() + 1) == bus_id) { + gpu_allocators_[i]->AddAllocVisitor(visitor); + } + } while (bus_id >= static_cast<int64>(gpu_visitors_.size())) { - gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>()); + gpu_visitors_.push_back(std::vector<AllocVisitor>()); } gpu_visitors_[bus_id].push_back(visitor); #endif // GOOGLE_CUDA } -void GPUProcessState::AddCUDAHostAllocVisitor( - int numa_node, const SubAllocator::Visitor& visitor) { -#if GOOGLE_CUDA - mutex_lock lock(mu_); - CHECK(cuda_host_allocators_.empty()) // Crash OK - << "AddCUDAHostAllocVisitor must be called before " - "first call to GetCUDAHostAllocator."; - while (numa_node >= static_cast<int64>(cuda_host_alloc_visitors_.size())) { - cuda_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>()); - } - cuda_host_alloc_visitors_[numa_node].push_back(visitor); -#endif // GOOGLE_CUDA -} - -void GPUProcessState::AddCUDAHostFreeVisitor( - int numa_node, const SubAllocator::Visitor& visitor) { -#if GOOGLE_CUDA - mutex_lock lock(mu_); - CHECK(cuda_host_allocators_.empty()) // Crash OK - << "AddCUDAHostFreeVisitor must be called before " - "first call to GetCUDAHostAllocator."; - while (numa_node >= static_cast<int64>(cuda_host_free_visitors_.size())) { - cuda_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>()); - } - cuda_host_free_visitors_[numa_node].push_back(visitor); -#endif // GOOGLE_CUDA -} - void GPUProcessState::TestOnlyReset() { - if (process_state_) { - process_state_->ProcessState::TestOnlyReset(); - } + process_state_->ProcessState::TestOnlyReset(); { mutex_lock lock(mu_); gpu_device_enabled_ = false; - gpu_allocators_.clear(); gpu_visitors_.clear(); - cuda_host_allocators_.clear(); - cuda_host_alloc_visitors_.clear(); - cuda_host_free_visitors_.clear(); + gtl::STLDeleteElements(&gpu_allocators_); + gtl::STLDeleteElements(&cuda_host_allocators_); + gtl::STLDeleteElements(&gpu_al_); + gtl::STLDeleteElements(&cuda_al_); } } diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h index 43e9a31660..cb41c3c6bd 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h +++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h @@ -32,6 +32,7 @@ limitations under the License. namespace tensorflow { class Allocator; +class VisitableAllocator; class PoolAllocator; // Singleton that manages per-process state when GPUs are present. @@ -71,30 +72,18 @@ class GPUProcessState { virtual Allocator* GetCUDAHostAllocator(int numa_node); - // Registers a Visitor to be invoked on new chunks of memory allocated by the - // SubAllocator of every GPU proximate to the specified bus. The AllocVisitor - // is provided with a memory pointer, a GPU id, and the size of the area it - // identifies. The pointer is not guaranteed to be valid after the call - // terminates. The intention is for this interface to be used for network - // device memory registration. "bus_id" is platform-specific. On many - // platforms it should be 0. On machines with multiple PCIe buses, it should - // be the index of one of the PCIe buses (maybe the NUMA node at which the - // PCIe is rooted). If the bus_id is invalid, results are undefined. - virtual void AddGPUAllocVisitor(int bus_id, - const SubAllocator::Visitor& visitor); - - // Registers a Visitor to be invoked on new chunks of memory allocated by - // the SubAllocator of the CUDAHostAllocator for the given numa_node. - virtual void AddCUDAHostAllocVisitor(int numa_node, - const SubAllocator::Visitor& visitor); - - // Registers a Visitor to be invoked on each chunk handed back for freeing to - // the SubAllocator of the CUDAHostAllocator for the given numa_node. - virtual void AddCUDAHostFreeVisitor(int numa_node, - const SubAllocator::Visitor& visitor); - - // Returns bus_id for the given GPU id. - virtual int BusIdForGPU(TfGpuId tf_gpu_id); + // Registers a function to be called once on every new Region + // allocated by every GPURegionAllocator proximate to the specified + // bus. The AllocVisitor is provided with a memory pointer and the + // size of the area it identifies. The pointer is not guaranteed to + // be valid after the call terminates. The intention is for this + // interface to be used for network device memory registration. + // "bus_id" is platform-specific. On many platforms it + // should be 0. On machines with multiple PCIe buses, it should be + // the index of one of the PCIe buses. If the bus_id is invalid, + // results are undefined. + typedef std::function<void(void*, size_t)> AllocVisitor; + virtual void AddGPUAllocVisitor(int bus_id, const AllocVisitor& visitor); protected: GPUProcessState(); @@ -114,22 +103,17 @@ class GPUProcessState { mutex mu_; - struct AllocatorParts { - std::unique_ptr<Allocator> allocator; - SubAllocator* sub_allocator; // owned by allocator - std::unique_ptr<Allocator> recording_allocator; - }; - std::vector<AllocatorParts> gpu_allocators_ GUARDED_BY(mu_); - std::vector<std::vector<SubAllocator::Visitor>> gpu_visitors_ GUARDED_BY(mu_); - - std::vector<AllocatorParts> cuda_host_allocators_ GUARDED_BY(mu_); - std::vector<std::vector<SubAllocator::Visitor>> cuda_host_alloc_visitors_ - GUARDED_BY(mu_); - std::vector<std::vector<SubAllocator::Visitor>> cuda_host_free_visitors_ - GUARDED_BY(mu_); + std::vector<VisitableAllocator*> gpu_allocators_ GUARDED_BY(mu_); + std::vector<std::vector<AllocVisitor>> gpu_visitors_ GUARDED_BY(mu_); + std::vector<Allocator*> cuda_host_allocators_ GUARDED_BY(mu_); virtual ~GPUProcessState(); + // Optional RecordingAllocators that wrap the corresponding + // Allocators for runtime attribute use analysis. + std::vector<Allocator*> gpu_al_ GUARDED_BY(mu_); + std::vector<Allocator*> cuda_al_ GUARDED_BY(mu_); + friend class GPUDeviceTest; }; diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc index 6b2f6547b0..583bff2c07 100644 --- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc @@ -31,8 +31,7 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) { 2 /*pool_size_limit*/, false /*auto_resize*/, new CUDAHostAllocator( platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0)) - .ValueOrDie(), - 0 /*numa_node*/, {}, {}), + .ValueOrDie()), new NoopRounder, "pool"); EXPECT_EQ(nullptr, pool.AllocateRaw(4 /*alignment*/, 0 /*num_bytes*/)); @@ -50,8 +49,7 @@ TEST(PoolAllocatorTest, ZeroSizePool) { 0 /*pool_size_limit*/, false /*auto_resize*/, new CUDAHostAllocator( platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0)) - .ValueOrDie(), - 0 /*numa_node*/, {}, {}), + .ValueOrDie()), new NoopRounder, "pool"); EXPECT_EQ(0, pool.get_from_pool_count()); @@ -84,8 +82,7 @@ TEST(PoolAllocatorTest, Alignment) { 0 /*pool_size_limit*/, false /*auto_resize*/, new CUDAHostAllocator( platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0)) - .ValueOrDie(), - 0 /*numa_node*/, {}, {}), + .ValueOrDie()), new NoopRounder, "pool"); for (int i = 0; i < 16; ++i) { size_t alignment = 1 << i; @@ -100,8 +97,8 @@ TEST(PoolAllocatorTest, Alignment) { TEST(PoolAllocatorTest, AutoResize) { PoolAllocator pool(2 /*pool_size_limit*/, true /*auto_resize*/, - new BasicCPUAllocator(0 /*numa_node*/, {}, {}), - new NoopRounder, "pool"); + new BasicCPUAllocator(0 /*numa_node*/), new NoopRounder, + "pool"); // Alloc/dealloc 10 sizes just a few times, confirming pool size // stays at 2. @@ -126,32 +123,14 @@ TEST(PoolAllocatorTest, AutoResize) { } TEST(PoolAllocatorTest, CudaHostAllocator) { - int alloc_count = 0; - int64 alloc_size = 0; - SubAllocator::Visitor alloc_visitor = - [&alloc_count, &alloc_size](void* ptr, int numa_node, int64 size) { - ++alloc_count; - alloc_size += size; - }; - int free_count = 0; - int64 free_size = 0; - SubAllocator::Visitor free_visitor = - [&free_count, &free_size](void* ptr, int numa_node, int64 size) { - ++free_count; - free_size += size; - }; se::Platform* platform = se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); - CUDAHostAllocator* sub_allocator = new CUDAHostAllocator( - platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0)) - .ValueOrDie(), - 0 /*numa_node*/, {alloc_visitor}, {free_visitor}); - PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/, - sub_allocator, new NoopRounder, "pool"); - EXPECT_EQ(0, alloc_count); - EXPECT_EQ(0, alloc_size); - EXPECT_EQ(0, free_count); - EXPECT_EQ(0, free_size); + PoolAllocator pool( + 2 /*pool_size_limit*/, false /*auto_resize*/, + new CUDAHostAllocator( + platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0)) + .ValueOrDie()), + new NoopRounder, "pool"); // Repeatedly Get a 16-byte value, confirming that there's only // one real allocation. @@ -159,10 +138,6 @@ TEST(PoolAllocatorTest, CudaHostAllocator) { EXPECT_EQ(0, pool.get_from_pool_count()); EXPECT_EQ(1, pool.allocated_count()); EXPECT_NE(nullptr, p1_16); - EXPECT_EQ(1, alloc_count); // Underlying suballoc of 16 bytes - // Each suballocation includes a 16B ChunkPrefix. - static const int kChunkPrefixSize = 16; - EXPECT_EQ(16 + (alloc_count * kChunkPrefixSize), alloc_size); pool.DeallocateRaw(p1_16); // Pool contents {16} EXPECT_EQ(1, pool.put_count()); @@ -173,9 +148,6 @@ TEST(PoolAllocatorTest, CudaHostAllocator) { pool.DeallocateRaw(p2_16); // Put it back. // Pool contents {16} EXPECT_EQ(2, pool.put_count()); - EXPECT_EQ(1, alloc_count); // Underlying suballoc of 16 bytes - EXPECT_EQ(16 + (alloc_count * kChunkPrefixSize), alloc_size); - EXPECT_EQ(0, free_count); // Get two more values of different sizes. void* p3_4 = pool.AllocateRaw(4, 4); @@ -188,9 +160,6 @@ TEST(PoolAllocatorTest, CudaHostAllocator) { void* p4_2 = pool.AllocateRaw(4, 2); // Get a third size buffer. EXPECT_NE(nullptr, p4_2); EXPECT_EQ(0, pool.evicted_count()); - EXPECT_EQ(3, alloc_count); - EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size); - EXPECT_EQ(0, free_count); // The pool is full: when we put back p4_2, the 16-byte buffer // should be evicted since it was least recently inserted. @@ -198,10 +167,6 @@ TEST(PoolAllocatorTest, CudaHostAllocator) { // Pool contents {2, 4} EXPECT_EQ(4, pool.put_count()); EXPECT_EQ(1, pool.evicted_count()); - EXPECT_EQ(3, alloc_count); - EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size); - EXPECT_EQ(1, free_count); - EXPECT_EQ(16 + (free_count * kChunkPrefixSize), free_size); // Re-getting and putting size 2 or 4 should not alter pool size or // num-evicted. @@ -215,20 +180,12 @@ TEST(PoolAllocatorTest, CudaHostAllocator) { EXPECT_EQ(6, pool.put_count()); EXPECT_EQ(3, pool.allocated_count()); EXPECT_EQ(1, pool.evicted_count()); - EXPECT_EQ(3, alloc_count); - EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size); - EXPECT_EQ(1, free_count); - EXPECT_EQ(16 + (free_count * kChunkPrefixSize), free_size); pool.Clear(); EXPECT_EQ(0, pool.get_from_pool_count()); EXPECT_EQ(0, pool.put_count()); EXPECT_EQ(0, pool.allocated_count()); EXPECT_EQ(0, pool.evicted_count()); - EXPECT_EQ(3, alloc_count); - EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size); - EXPECT_EQ(3, free_count); - EXPECT_EQ(16 + 4 + 2 + (free_count * kChunkPrefixSize), free_size); } TEST(PoolAllocatorTest, Pow2Rounder) { @@ -249,8 +206,7 @@ TEST(PoolAllocatorTest, Name) { 2 /*pool_size_limit*/, false /*auto_resize*/, new CUDAHostAllocator( platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0)) - .ValueOrDie(), - 0 /*numa_node*/, {}, {}), + .ValueOrDie()), new NoopRounder, "pool"); EXPECT_EQ("pool", pool.Name()); } diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h index 538a70668a..df9c3a686c 100644 --- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h +++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h @@ -23,11 +23,12 @@ limitations under the License. #include <cstdlib> #include "tensorflow/core/common_runtime/bfc_allocator.h" -#include "tensorflow/core/common_runtime/pool_allocator.h" +#include "tensorflow/core/common_runtime/visitable_allocator.h" +#include "tensorflow/core/framework/allocator_registry.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/mem.h" -#include "tensorflow/core/platform/numa.h" +#include "tensorflow/core/platform/mutex.h" #ifndef INTEL_MKL_DNN_ONLY #include "i_malloc.h" @@ -39,16 +40,20 @@ typedef unsigned int uint; namespace tensorflow { -class MklSubAllocator : public BasicCPUAllocator { +class MklSubAllocator : public SubAllocator { public: - MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {} ~MklSubAllocator() override {} + + void* Alloc(size_t alignment, size_t num_bytes) override { + return port::AlignedMalloc(num_bytes, alignment); + } + void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); } }; // CPU allocator that handles small-size allocations by calling // suballocator directly. Mostly, it is just a wrapper around a suballocator // (that calls malloc and free directly) with support for bookkeeping. -class MklSmallSizeAllocator : public Allocator { +class MklSmallSizeAllocator : public VisitableAllocator { public: MklSmallSizeAllocator(SubAllocator* sub_allocator, size_t total_memory, const string& name) @@ -70,6 +75,10 @@ class MklSmallSizeAllocator : public Allocator { CHECK(map_.insert(map_val).second); // Increment statistics for small-size allocations. IncrementStats(num_bytes); + // Call alloc visitors. + for (const auto& visitor : alloc_visitors_) { + visitor(ptr, num_bytes); + } } return ptr; } @@ -85,6 +94,9 @@ class MklSmallSizeAllocator : public Allocator { if (map_iter != map_.end()) { // Call free visitors. size_t dealloc_bytes = map_iter->second; + for (const auto& visitor : free_visitors_) { + visitor(ptr, dealloc_bytes); + } sub_allocator_->Free(ptr, dealloc_bytes); DecrementStats(dealloc_bytes); map_.erase(map_iter); @@ -109,6 +121,16 @@ class MklSmallSizeAllocator : public Allocator { stats_.Clear(); } + void AddAllocVisitor(Visitor visitor) override { + mutex_lock l(mutex_); + alloc_visitors_.push_back(visitor); + } + + void AddFreeVisitor(Visitor visitor) override { + mutex_lock l(mutex_); + free_visitors_.push_back(visitor); + } + private: // Increment statistics for the allocator handling small allocations. inline void IncrementStats(size_t alloc_size) @@ -141,11 +163,15 @@ class MklSmallSizeAllocator : public Allocator { // Allocator stats for small allocs AllocatorStats stats_ GUARDED_BY(mutex_); + + // Visitors + std::vector<Visitor> alloc_visitors_ GUARDED_BY(mutex_); + std::vector<Visitor> free_visitors_ GUARDED_BY(mutex_); }; /// CPU allocator for MKL that wraps BFC allocator and intercepts /// and redirects memory allocation calls from MKL. -class MklCPUAllocator : public Allocator { +class MklCPUAllocator : public VisitableAllocator { public: // Constructor and other standard functions @@ -258,6 +284,16 @@ class MklCPUAllocator : public Allocator { large_size_allocator_->ClearStats(); } + void AddAllocVisitor(Visitor visitor) override { + small_size_allocator_->AddAllocVisitor(visitor); + large_size_allocator_->AddAllocVisitor(visitor); + } + + void AddFreeVisitor(Visitor visitor) override { + small_size_allocator_->AddFreeVisitor(visitor); + large_size_allocator_->AddFreeVisitor(visitor); + } + private: // Hooks provided by this allocator for memory allocation routines from MKL @@ -294,7 +330,7 @@ class MklCPUAllocator : public Allocator { // The alignment that we need for the allocations static constexpr const size_t kAlignment = 64; - Allocator* large_size_allocator_; // owned by this class + VisitableAllocator* large_size_allocator_; // owned by this class MklSmallSizeAllocator* small_size_allocator_; // owned by this class. SubAllocator* sub_allocator_; // not owned by this class diff --git a/tensorflow/core/common_runtime/pool_allocator.cc b/tensorflow/core/common_runtime/pool_allocator.cc index 66dc8f3322..fdad8de8d6 100644 --- a/tensorflow/core/common_runtime/pool_allocator.cc +++ b/tensorflow/core/common_runtime/pool_allocator.cc @@ -40,7 +40,8 @@ PoolAllocator::PoolAllocator(size_t pool_size_limit, bool auto_resize, auto_resize_(auto_resize), pool_size_limit_(pool_size_limit), allocator_(allocator), - size_rounder_(size_rounder) { + size_rounder_(size_rounder), + allocation_begun_(false) { if (auto_resize) { CHECK_LT(size_t{0}, pool_size_limit) << "size limit must be > 0 if auto_resize is true."; @@ -92,6 +93,7 @@ ChunkPrefix* FindPrefix(void* user_ptr) { } // namespace void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) { + if (!allocation_begun_) allocation_begun_ = true; if (num_bytes == 0) return nullptr; // If alignment is larger than kPoolAlignment, increase num_bytes so that we @@ -127,6 +129,9 @@ void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) { return PrepareChunk(r, alignment, num_bytes); } else { void* ptr = allocator_->Alloc(kPoolAlignment, num_bytes); + for (const auto& v : alloc_visitors_) { + v(ptr, num_bytes); + } return PrepareChunk(ptr, alignment, num_bytes); } } @@ -136,6 +141,9 @@ void PoolAllocator::DeallocateRaw(void* ptr) { ChunkPrefix* cp = FindPrefix(ptr); CHECK_LE((void*)cp, (void*)ptr); if (!has_size_limit_ && !auto_resize_) { + for (const auto& v : free_visitors_) { + v(cp, cp->num_bytes); + } allocator_->Free(cp, cp->num_bytes); } else { mutex_lock lock(mutex_); @@ -156,6 +164,9 @@ void PoolAllocator::Clear() { mutex_lock lock(mutex_); for (auto iter : pool_) { PtrRecord* pr = iter.second; + for (const auto& v : free_visitors_) { + v(pr->ptr, pr->num_bytes); + } allocator_->Free(pr->ptr, pr->num_bytes); delete pr; } @@ -210,6 +221,9 @@ void PoolAllocator::EvictOne() { DCHECK(iter != pool_.end()); } pool_.erase(iter); + for (const auto& v : free_visitors_) { + v(prec->ptr, prec->num_bytes); + } allocator_->Free(prec->ptr, prec->num_bytes); delete prec; ++evicted_count_; @@ -255,19 +269,28 @@ void PoolAllocator::EvictOne() { } } +void PoolAllocator::AddAllocVisitor(Visitor visitor) { + mutex_lock lock(mutex_); + CHECK(!allocation_begun_) + << "AddAllocVisitor may not be called after pool allocation " + << "has begun."; + alloc_visitors_.push_back(visitor); +} + +void PoolAllocator::AddFreeVisitor(Visitor visitor) { + mutex_lock lock(mutex_); + CHECK(!allocation_begun_) + << "AddFreeVisitor may not be called after pool allocation " + << "has begun."; + free_visitors_.push_back(visitor); +} + void* BasicCPUAllocator::Alloc(size_t alignment, size_t num_bytes) { - void* ptr = nullptr; - if (num_bytes > 0) { - ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment)); - VisitAlloc(ptr, numa_node_, num_bytes); - } - return ptr; + return port::AlignedMalloc(num_bytes, static_cast<int>(alignment)); } void BasicCPUAllocator::Free(void* ptr, size_t num_bytes) { - if (num_bytes > 0) { - VisitFree(ptr, numa_node_, num_bytes); - port::AlignedFree(ptr); - } + port::AlignedFree(ptr); } + } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h index 5b4623ba10..607734445b 100644 --- a/tensorflow/core/common_runtime/pool_allocator.h +++ b/tensorflow/core/common_runtime/pool_allocator.h @@ -16,13 +16,14 @@ limitations under the License. #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_ #define TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_ -// Simple LRU pool allocators for various flavors of CPU RAM. +// Simple LRU pool allocators for various flavors of CPU RAM that +// implement the VisitableAllocator interface. #include <atomic> #include <map> #include <memory> #include <vector> -#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/common_runtime/visitable_allocator.h" #include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" @@ -40,7 +41,7 @@ class RoundUpInterface { // Size-limited pool of memory buffers obtained from a SubAllocator // instance. Pool eviction policy is LRU. -class PoolAllocator : public Allocator { +class PoolAllocator : public VisitableAllocator { public: // "pool_size_limit" is the maximum number of returned, re-usable // memory buffers to keep in the pool. If pool_size_limit == 0, the @@ -63,6 +64,14 @@ class PoolAllocator : public Allocator { void DeallocateRaw(void* ptr) override; + // REQUIRES: The following functions may only be called prior + // to the first Allocate*() call. Once allocation has begun, it is + // illegal to register another visitor. + + void AddAllocVisitor(Visitor visitor) override; + + void AddFreeVisitor(Visitor visitor) override; + // Allocate an unused memory region of size "num_bytes". Fetch from // the pool if available, otherwise call allocator_. void* Get(size_t num_bytes); @@ -132,6 +141,12 @@ class PoolAllocator : public Allocator { int64 put_count_ GUARDED_BY(mutex_) = 0; int64 allocated_count_ GUARDED_BY(mutex_) = 0; int64 evicted_count_ GUARDED_BY(mutex_) = 0; + // Write access to these is guarded by mutex_, but not read + // access. They may only be modified prior to the first + // allocation. Later attempts to modify will fail. + std::vector<Visitor> alloc_visitors_; + std::vector<Visitor> free_visitors_; + std::atomic<bool> allocation_begun_; }; // Do-nothing rounder. Passes through sizes unchanged. @@ -151,9 +166,7 @@ class Pow2Rounder : public RoundUpInterface { class BasicCPUAllocator : public SubAllocator { public: // Argument numa_node is currently ignored. - BasicCPUAllocator(int numa_node, const std::vector<Visitor>& alloc_visitors, - const std::vector<Visitor>& free_visitors) - : SubAllocator(alloc_visitors, free_visitors), numa_node_(numa_node) {} + explicit BasicCPUAllocator(int numa_node) : numa_node_(numa_node) {} ~BasicCPUAllocator() override {} @@ -163,8 +176,6 @@ class BasicCPUAllocator : public SubAllocator { private: int numa_node_; - - TF_DISALLOW_COPY_AND_ASSIGN(BasicCPUAllocator); }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/process_state.cc b/tensorflow/core/common_runtime/process_state.cc index bcaa37fc8a..447338e7bd 100644 --- a/tensorflow/core/common_runtime/process_state.cc +++ b/tensorflow/core/common_runtime/process_state.cc @@ -71,28 +71,20 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) { return MemDesc(); } -Allocator* ProcessState::GetCPUAllocator(int numa_node) { +VisitableAllocator* ProcessState::GetCPUAllocator(int numa_node) { CHECK_GE(numa_node, 0); if (!numa_enabled_) numa_node = 0; mutex_lock lock(mu_); while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) { - // If visitors have been defined we need an Allocator built from - // a SubAllocator. Prefer BFCAllocator, but fall back to PoolAllocator - // depending on env var setting. - const bool alloc_visitors_defined = - (!cpu_alloc_visitors_.empty() || !cpu_free_visitors_.empty()); bool use_bfc_allocator = false; - Status status = ReadBoolFromEnvVar( - "TF_CPU_ALLOCATOR_USE_BFC", alloc_visitors_defined, &use_bfc_allocator); + // TODO(reedwm): Switch default to BGFAllocator if it's at least as fast and + // efficient. + Status status = ReadBoolFromEnvVar("TF_CPU_ALLOCATOR_USE_BFC", false, + &use_bfc_allocator); if (!status.ok()) { LOG(ERROR) << "GetCPUAllocator: " << status.error_message(); } - Allocator* allocator = nullptr; - SubAllocator* sub_allocator = - (alloc_visitors_defined || use_bfc_allocator) - ? new BasicCPUAllocator(numa_enabled_ ? numa_node : -1, - cpu_alloc_visitors_, cpu_free_visitors_) - : nullptr; + VisitableAllocator* allocator; if (use_bfc_allocator) { // TODO(reedwm): evaluate whether 64GB by default is the best choice. int64 cpu_mem_limit_in_mb = -1; @@ -103,63 +95,34 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) { LOG(ERROR) << "GetCPUAllocator: " << status.error_message(); } int64 cpu_mem_limit = cpu_mem_limit_in_mb * (1LL << 20); - DCHECK(sub_allocator); - allocator = - new BFCAllocator(sub_allocator, cpu_mem_limit, true /*allow_growth*/, - "bfc_cpu_allocator_for_gpu" /*name*/); + allocator = new BFCAllocator( + new BasicCPUAllocator(numa_enabled_ ? numa_node : -1), cpu_mem_limit, + true /*allow_growth*/, "bfc_cpu_allocator_for_gpu" /*name*/); VLOG(2) << "Using BFCAllocator with memory limit of " << cpu_mem_limit_in_mb << " MB for ProcessState CPU allocator"; - } else if (alloc_visitors_defined) { - DCHECK(sub_allocator); - allocator = - new PoolAllocator(100 /*pool_size_limit*/, true /*auto_resize*/, - sub_allocator, new NoopRounder, "cpu_pool"); + } else { + allocator = new PoolAllocator( + 100 /*pool_size_limit*/, true /*auto_resize*/, + new BasicCPUAllocator(numa_enabled_ ? numa_node : -1), + new NoopRounder, "cpu_pool"); VLOG(2) << "Using PoolAllocator for ProcessState CPU allocator " << "numa_enabled_=" << numa_enabled_ << " numa_node=" << numa_node; - } else { - DCHECK(!sub_allocator); - allocator = cpu_allocator(); } - if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) { + if (LogMemory::IsEnabled()) { // Wrap the allocator to track allocation ids for better logging // at the cost of performance. - allocator = new TrackingAllocator(allocator, true); + allocator = new TrackingVisitableAllocator(allocator, true); } cpu_allocators_.push_back(allocator); - if (!sub_allocator) { - DCHECK(cpu_alloc_visitors_.empty() && cpu_free_visitors_.empty()); - } } return cpu_allocators_[numa_node]; } -void ProcessState::AddCPUAllocVisitor(SubAllocator::Visitor visitor) { - VLOG(1) << "AddCPUAllocVisitor"; - mutex_lock lock(mu_); - CHECK_EQ(0, cpu_allocators_.size()) // Crash OK - << "AddCPUAllocVisitor must be called prior to first call to " - "ProcessState::GetCPUAllocator"; - cpu_alloc_visitors_.push_back(std::move(visitor)); -} - -void ProcessState::AddCPUFreeVisitor(SubAllocator::Visitor visitor) { - mutex_lock lock(mu_); - CHECK_EQ(0, cpu_allocators_.size()) // Crash OK - << "AddCPUFreeVisitor must be called prior to first call to " - "ProcessState::GetCPUAllocator"; - cpu_free_visitors_.push_back(std::move(visitor)); -} - void ProcessState::TestOnlyReset() { mutex_lock lock(mu_); - // Don't delete this value because it's static. - Allocator* default_cpu_allocator = cpu_allocator(); mem_desc_map_.clear(); - for (Allocator* a : cpu_allocators_) { - if (a != default_cpu_allocator) delete a; - } - cpu_allocators_.clear(); + gtl::STLDeleteElements(&cpu_allocators_); gtl::STLDeleteElements(&cpu_al_); } diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h index cac312d849..2892677333 100644 --- a/tensorflow/core/common_runtime/process_state.h +++ b/tensorflow/core/common_runtime/process_state.h @@ -30,6 +30,7 @@ limitations under the License. namespace tensorflow { class Allocator; +class VisitableAllocator; class PoolAllocator; // Singleton that manages per-process state, e.g. allocation of @@ -64,15 +65,7 @@ class ProcessState { // Returns the one CPUAllocator used for the given numa_node. // TEMPORARY: ignores numa_node. - Allocator* GetCPUAllocator(int numa_node); - - // Registers alloc visitor for the CPU allocator(s). - // REQUIRES: must be called before GetCPUAllocator. - void AddCPUAllocVisitor(SubAllocator::Visitor v); - - // Registers free visitor for the CPU allocator(s). - // REQUIRES: must be called before GetCPUAllocator. - void AddCPUFreeVisitor(SubAllocator::Visitor v); + VisitableAllocator* GetCPUAllocator(int numa_node); typedef std::unordered_map<const void*, MemDesc> MDMap; @@ -94,9 +87,7 @@ class ProcessState { mutex mu_; - std::vector<Allocator*> cpu_allocators_ GUARDED_BY(mu_); - std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ GUARDED_BY(mu_); - std::vector<SubAllocator::Visitor> cpu_free_visitors_ GUARDED_BY(mu_); + std::vector<VisitableAllocator*> cpu_allocators_ GUARDED_BY(mu_); virtual ~ProcessState(); diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h index 9d59264899..103eee03b3 100644 --- a/tensorflow/core/common_runtime/renamed_device.h +++ b/tensorflow/core/common_runtime/renamed_device.h @@ -72,10 +72,9 @@ class RenamedDevice : public Device { return underlying_->MakeGpuDevice(); } - Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device, - DeviceContext* dc, - Allocator* allocator) override { - return underlying_->ReinitializeGpuDevice(context, device, dc, allocator); + void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device, + DeviceContext* dc, Allocator* allocator) override { + underlying_->ReinitializeGpuDevice(context, device, dc, allocator); } Status MakeTensorFromProto(const TensorProto& tensor_proto, diff --git a/tensorflow/core/common_runtime/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h new file mode 100644 index 0000000000..ae0563a96a --- /dev/null +++ b/tensorflow/core/common_runtime/visitable_allocator.h @@ -0,0 +1,79 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_ + +#include <functional> +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/tracking_allocator.h" + +namespace tensorflow { + +// Subclass VisitableAllocator instead of Allocator when a memory +// allocator needs to enable some kind of registration/deregistration +// of memory areas. +class VisitableAllocator : public Allocator { + public: + // Visitor gets called with a pointer to a memory area and its + // size in bytes. + typedef std::function<void(void*, size_t)> Visitor; + + // Register a visitor guaranteed to be called exactly once on each + // chunk of memory newly allocated from the underlying device. + // Typically, chunks will be reused and possibly sub-divided by a + // pool manager, so the calls will happen only once per process + // execution, not once per tensor (re)allocation. + virtual void AddAllocVisitor(Visitor visitor) = 0; + + // Register a visitor guaranteed to be called on each chunk of + // memory returned to the underlying device. + virtual void AddFreeVisitor(Visitor visitor) = 0; +}; + +// Needed for cases when a VisitableAllocator gets wrapped for tracking. +// Multiple-inheritance is considered acceptable in this case because +// VisitableAllocator is a pure virtual interface and only TrackingAllocator +// has default implementation. +class TrackingVisitableAllocator : public TrackingAllocator, + public VisitableAllocator { + public: + TrackingVisitableAllocator(VisitableAllocator* allocator, bool track_ids) + : TrackingAllocator(allocator, track_ids), allocator_(allocator) {} + ~TrackingVisitableAllocator() override {} + + string Name() override { return TrackingAllocator::Name(); } + + void* AllocateRaw(size_t alignment, size_t num_bytes) override { + return TrackingAllocator::AllocateRaw(alignment, num_bytes); + } + + void DeallocateRaw(void* ptr) override { + TrackingAllocator::DeallocateRaw(ptr); + } + + void AddAllocVisitor(Visitor visitor) override { + allocator_->AddAllocVisitor(visitor); + } + + void AddFreeVisitor(Visitor visitor) override { + allocator_->AddFreeVisitor(visitor); + } + + protected: + VisitableAllocator* allocator_; +}; +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_ diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc index 84cee5569c..2a7ee16a16 100644 --- a/tensorflow/core/framework/allocator.cc +++ b/tensorflow/core/framework/allocator.cc @@ -196,7 +196,7 @@ class CPUAllocatorFactory : public AllocatorFactory { class CPUSubAllocator : public SubAllocator { public: explicit CPUSubAllocator(CPUAllocator* cpu_allocator) - : SubAllocator({}, {}), cpu_allocator_(cpu_allocator) {} + : cpu_allocator_(cpu_allocator) {} void* Alloc(size_t alignment, size_t num_bytes) override { return cpu_allocator_->AllocateRaw(alignment, num_bytes); @@ -222,22 +222,4 @@ Allocator* cpu_allocator() { } return cpu_alloc; } - -SubAllocator::SubAllocator(const std::vector<Visitor>& alloc_visitors, - const std::vector<Visitor>& free_visitors) - : alloc_visitors_(alloc_visitors), free_visitors_(free_visitors) {} - -void SubAllocator::VisitAlloc(void* ptr, int index, size_t num_bytes) { - for (const auto& v : alloc_visitors_) { - v(ptr, index, num_bytes); - } -} - -void SubAllocator::VisitFree(void* ptr, int index, size_t num_bytes) { - // Although we don't guarantee any order of visitor application, strive - // to apply free visitors in reverse order of alloc visitors. - for (int i = free_visitors_.size() - 1; i >= 0; --i) { - free_visitors_[i](ptr, index, num_bytes); - } -} } // namespace tensorflow diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h index 8c23604625..ded120b704 100644 --- a/tensorflow/core/framework/allocator.h +++ b/tensorflow/core/framework/allocator.h @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/core/framework/resource_handle.h" #include "tensorflow/core/framework/type_traits.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -388,36 +387,13 @@ void EnableCPUAllocatorStats(bool enable); // full statistics. By default, it's disabled. void EnableCPUAllocatorFullStats(bool enable); -// An object that does the underlying suballoc/free of memory for a higher-level -// allocator. The expectation is that the higher-level allocator is doing some -// kind of cache or pool management so that it will call SubAllocator::Alloc and -// Free relatively infrequently, compared to the number of times its own -// AllocateRaw and Free methods are called. +// Abstract interface of an object that does the underlying suballoc/free of +// memory for a higher-level allocator. class SubAllocator { public: - // Visitor gets called with a pointer to a memory area and its - // size in bytes. The index value will be numa_node for a CPU - // allocator and GPU id for a GPU allocator. - typedef std::function<void(void*, int index, size_t)> Visitor; - - SubAllocator(const std::vector<Visitor>& alloc_visitors, - const std::vector<Visitor>& free_visitors); - virtual ~SubAllocator() {} virtual void* Alloc(size_t alignment, size_t num_bytes) = 0; virtual void Free(void* ptr, size_t num_bytes) = 0; - - protected: - // Implementation of Alloc() method must call this on newly allocated - // value. - void VisitAlloc(void* ptr, int index, size_t num_bytes); - - // Implementation of Free() method must call this on value to be - // freed immediately before deallocation. - void VisitFree(void* ptr, int index, size_t num_bytes); - - const std::vector<Visitor> alloc_visitors_; - const std::vector<Visitor> free_visitors_; }; } // namespace tensorflow diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h index 53ac639b4c..794250a2c1 100644 --- a/tensorflow/core/framework/device_base.h +++ b/tensorflow/core/framework/device_base.h @@ -214,12 +214,10 @@ class DeviceBase { // This is overridden by GPU devices to reinitialize the derived // type returned by MakeGpuDevice. - virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/, - PerOpGpuDevice* /*device*/, - DeviceContext* /*dc*/, - Allocator* /*allocator*/) { - return Status::OK(); - } + virtual void ReinitializeGpuDevice(OpKernelContext* /*context*/, + PerOpGpuDevice* /*device*/, + DeviceContext* /*dc*/, + Allocator* /*allocator*/) {} // Unimplemented by default virtual const DeviceAttributes& attributes() const; diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index 3e34bf0418..80f2b12987 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -265,12 +265,9 @@ OpKernelContext::OpKernelContext(Params* params, int num_outputs) params_->ensure_eigen_gpu_device(); if (params_->eigen_gpu_device != nullptr) { Allocator* eigen_gpu_allocator = get_allocator(AllocatorAttributes()); - Status s = params_->device->ReinitializeGpuDevice( - this, params_->eigen_gpu_device, params_->op_device_context, - eigen_gpu_allocator); - if (!s.ok()) { - SetStatus(s); - } + params_->device->ReinitializeGpuDevice(this, params_->eigen_gpu_device, + params_->op_device_context, + eigen_gpu_allocator); } if (params_->record_tensor_accesses) { referenced_tensors_.Init(); |