Diffstat (limited to 'tensorflow')
32 files changed, 628 insertions, 577 deletions
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index 726f74c7b7..bb06f1c41c 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -138,6 +138,8 @@ class GdrMemoryManager : public RemoteMemoryManager {
                       Device* device, DeviceContext* device_context,
                       bool on_host, StatusCallback done) override;
 
+  static void RegMemVisitors();
+
 protected:
   Status CreateEndpoint(const string& host, const string& port,
                         RdmaEndpointPtr& endpoint);
@@ -183,35 +185,51 @@ class GdrMemoryManager : public RemoteMemoryManager {
   TF_DISALLOW_COPY_AND_ASSIGN(GdrMemoryManager);
 };
 
-// TODO(byronyi): remove this class and its registration when the default
-// cpu_allocator() returns visitable allocator, or cpu_allocator() is no
-// longer in use.
-class BFCGdrAllocator : public BFCAllocator {
- public:
-  BFCGdrAllocator()
-      : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36,
-                     true, "cpu_gdr_bfc") {}
-};
-class BFCGdrAllocatorFactory : public AllocatorFactory {
- public:
-  Allocator* CreateAllocator() override { return new BFCGdrAllocator; }
-
-  virtual SubAllocator* CreateSubAllocator(int numa_node) {
-    return new BasicCPUAllocator(numa_node);
-  }
-};
-
-REGISTER_MEM_ALLOCATOR("BFCGdrAllocator", 102, BFCGdrAllocatorFactory);
-
 GdrMemoryManager::GdrMemoryManager(const string& host, const string& port)
     : host_(host),
       port_(port),
       listening_(nullptr, EndpointDeleter),
       stopped_(true),
-      next_key_(0) {}
+      next_key_(0) {
+  static std::once_flag flag;
+  std::call_once(flag, []() { RegMemVisitors(); });
+}
 
 GdrMemoryManager::~GdrMemoryManager() { close(epfd_); }
 
+/*static*/ void GdrMemoryManager::RegMemVisitors() {
+  SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node,
+                                           size_t num_bytes) {
+    GdrMemoryManager::Singleton().InsertMemoryRegion(
+        ptr, num_bytes, strings::StrCat("CPU:", numa_node));
+  };
+  SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node,
+                                          size_t num_bytes) {
+    GdrMemoryManager::Singleton().EvictMemoryRegion(ptr, num_bytes);
+  };
+  ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
+  ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
+
+#if GOOGLE_CUDA
+  if (IsGDRAvailable()) {
+    int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
+
+    // Note we don't free allocated GPU memory so there is no free visitor
+    SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id,
+                                                  size_t num_bytes) {
+      RdmaMemoryMgr::Singleton().InsertMemoryRegion(
+          ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
+    };
+    GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
+                                                     cuda_alloc_visitor);
+    GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
+                                                          alloc_visitor);
+    GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
+    LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
+  }
+#endif  // GOOGLE_CUDA
+}
+
 Status GdrMemoryManager::Init() {
   epfd_ = epoll_create1(0);
   if (epfd_ == -1) {
@@ -271,48 +289,6 @@ Status GdrMemoryManager::Init() {
                                     "cannot add server to epoll");
   }
 
-  Allocator* allocators[] = {
-#if GOOGLE_CUDA
-    GPUProcessState::singleton()->GetCUDAHostAllocator(0),
-#endif  // GOOGLE_CUDA
-    ProcessState::singleton()->GetCPUAllocator(0),
-    cpu_allocator(),
-  };
-
-  using namespace std::placeholders;
-  VisitableAllocator::Visitor alloc_visitor =
-      std::bind(&GdrMemoryManager::InsertMemoryRegion, this, _1, _2);
-  VisitableAllocator::Visitor free_visitor =
-      std::bind(&GdrMemoryManager::EvictMemoryRegion, this, _1, _2);
-
-  std::set<Allocator*> instrumented_;
-
-  // Host memory allocators
-  for (Allocator* allocator : allocators) {
-    auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
-    CHECK(visitable_allocator)
-        << "is not visitable for instrumentation" << allocator->Name();
-    // Make sure we don't instrument the same allocator twice
-    if (instrumented_.find(allocator) == std::end(instrumented_)) {
-      visitable_allocator->AddAllocVisitor(alloc_visitor);
-      visitable_allocator->AddFreeVisitor(free_visitor);
-      instrumented_.insert(allocator);
-      LOG(INFO) << "Instrumenting CPU allocator " << allocator->Name();
-    }
-  }
-
-#if GOOGLE_CUDA
-  VisitableAllocator::Visitor cuda_alloc_visitor =
-      std::bind(&GdrMemoryManager::InsertMemoryRegion, this, _1, _2);
-  if (IsGDRAvailable()) {
-    // Note we don't free allocated GPU memory so there is no free visitor
-    int32_t bus_id = TryToReadNumaNode(listening_->verbs->device) + 1;
-    GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
-                                                     cuda_alloc_visitor);
-    LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
-  }
-#endif  // GOOGLE_CUDA
-
   return Status::OK();
 }
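The pattern introduced above replaces the old std::bind-based VisitableAllocator::Visitor callbacks with SubAllocator::Visitor lambdas that also receive the NUMA node (or GPU id), so a registered region can be labeled "CPU:<node>" or "GPU:<id>". A minimal self-contained sketch of that shape; the MemoryRegistry type here is a hypothetical stand-in for GdrMemoryManager/RdmaMemoryMgr, and only the visitor signature comes from the patch:

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>

// Hypothetical stand-in for RdmaMemoryMgr / GdrMemoryManager.
struct MemoryRegistry {
  static MemoryRegistry& Singleton() {
    static MemoryRegistry instance;
    return instance;
  }
  void InsertMemoryRegion(void* ptr, size_t bytes, const std::string& tag) {
    std::cout << "register " << bytes << " bytes at " << ptr << " (" << tag
              << ")\n";
  }
  void EvictMemoryRegion(void* ptr, size_t bytes) {
    std::cout << "evict " << bytes << " bytes at " << ptr << "\n";
  }
};

// Visitor signature matching SubAllocator::Visitor in this change:
// (pointer, numa_node-or-gpu_id, size).
using Visitor = std::function<void(void*, int, size_t)>;

int main() {
  Visitor alloc_visitor = [](void* ptr, int numa_node, size_t num_bytes) {
    MemoryRegistry::Singleton().InsertMemoryRegion(
        ptr, num_bytes, "CPU:" + std::to_string(numa_node));
  };
  Visitor free_visitor = [](void* ptr, int numa_node, size_t num_bytes) {
    MemoryRegistry::Singleton().EvictMemoryRegion(ptr, num_bytes);
  };
  char buf[64];
  alloc_visitor(buf, 0, sizeof(buf));
  free_visitor(buf, 0, sizeof(buf));
}

The extra int argument lets a single lambda serve every NUMA node or GPU, where the old std::bind form had to bake an allocator name into each registration.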
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index 3cb5e61fac..2784bf124c 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/contrib/verbs/grpc_verbs_client.h"
 #include "tensorflow/contrib/verbs/verbs_service.pb.h"
-#include "tensorflow/core/common_runtime/bfc_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/common_runtime/pool_allocator.h"
@@ -29,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
 
@@ -256,74 +256,41 @@ void MRDeleter(ibv_mr* mr) {
   }
 }
 
-// TODO(byronyi): remove this class and its registration when the default
-// cpu_allocator() returns visitable allocator, or cpu_allocator() is no
-// longer in use.
-class BFCRdmaAllocator : public BFCAllocator {
- public:
-  BFCRdmaAllocator()
-      : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36,
-                     true, "cpu_rdma_bfc") {}
-};
-class BFCRdmaAllocatorFactory : public AllocatorFactory {
- public:
-  Allocator* CreateAllocator() { return new BFCRdmaAllocator; }
-
-  SubAllocator* CreateSubAllocator(int numa_node) {
-    return new BasicCPUAllocator(numa_node);
-  }
-};
-
-REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocatorFactory);
-
 void RdmaMgr::InitAllocators() {
-  RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_;
+  static std::once_flag flag;
+  std::call_once(
+      flag, [this]() { RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_; });
+}
 
-  Allocator* allocators[] = {
-#if GOOGLE_CUDA
-    GPUProcessState::singleton()->GetCUDAHostAllocator(0),
-#endif  // GOOGLE_CUDA
-    ProcessState::singleton()->GetCPUAllocator(0),
-    cpu_allocator(),
+/*static*/ void RdmaMgr::RegMemVisitors() {
+  SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node,
+                                           size_t num_bytes) {
+    RdmaMemoryMgr::Singleton().InsertMemoryRegion(
+        ptr, num_bytes, strings::StrCat("CPU:", numa_node));
+  };
+  SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node,
+                                          size_t num_bytes) {
+    RdmaMemoryMgr::Singleton().EvictMemoryRegion(ptr, num_bytes);
   };
 
-  using namespace std::placeholders;
-
-  std::set<Allocator*> instrumented_;
-
-  // Host memory allocators
-  for (Allocator* allocator : allocators) {
-    VisitableAllocator::Visitor alloc_visitor =
-        std::bind(&RdmaMemoryMgr::InsertMemoryRegion,
-                  &RdmaMemoryMgr::Singleton(), _1, _2, allocator->Name());
-    VisitableAllocator::Visitor free_visitor = std::bind(
-        &RdmaMemoryMgr::EvictMemoryRegion, &RdmaMemoryMgr::Singleton(), _1, _2);
-
-    auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
-    CHECK(visitable_allocator)
-        << "is not visitable for instrumentation" << allocator->Name();
-    // Make sure we don't instrument the same allocator twice
-    if (instrumented_.find(allocator) == std::end(instrumented_)) {
-      visitable_allocator->AddAllocVisitor(alloc_visitor);
-      visitable_allocator->AddFreeVisitor(free_visitor);
-      instrumented_.insert(allocator);
-      LOG(INFO) << "Instrumenting CPU allocator " << allocator->Name();
-    }
-  }
+  ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
+  ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
 
 #if GOOGLE_CUDA
   if (IsGDRAvailable()) {
     // Note we don't free allocated GPU memory so there is no free visitor
     int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
 
-    char buf[8];
-    sprintf(buf, "gpu");
-    VisitableAllocator::Visitor cuda_alloc_visitor =
-        std::bind(&RdmaMemoryMgr::InsertMemoryRegion,
-                  &RdmaMemoryMgr::Singleton(), _1, _2, std::string(buf));
-
+    SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id,
+                                                  size_t num_bytes) {
+      RdmaMemoryMgr::Singleton().InsertMemoryRegion(
+          ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
+    };
     GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
                                                      cuda_alloc_visitor);
+    GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
+                                                          alloc_visitor);
+    GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
     LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
   }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/verbs/rdma_mgr.h b/tensorflow/contrib/verbs/rdma_mgr.h
index 9fffc335bb..74b92cc9a6 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.h
+++ b/tensorflow/contrib/verbs/rdma_mgr.h
@@ -39,6 +39,7 @@ class RdmaMgr {
   void SetupChannels();
   bool ConnectivityCheck();
   void InitAllocators();
+  static void RegMemVisitors();
   const string& local_worker() { return local_worker_; }
 
 private:
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index 1a0b5028fe..61469686e4 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -76,8 +76,13 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
   return Status::OK();
 }
 
+namespace {
+std::once_flag reg_mem_visitors_call;
+}  // namespace
+
 Status VerbsServer::Init(ServiceInitFunction service_func,
                          RendezvousMgrCreationFunction rendezvous_mgr_func) {
+  std::call_once(reg_mem_visitors_call, []() { RdmaMgr::RegMemVisitors(); });
   Status s = GrpcServer::Init(service_func, rendezvous_mgr_func);
   {
     mutex_lock l(mu_);
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index d55bd8d7ed..9bcf5b0865 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2783,7 +2783,6 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/step_stats_collector.h",
     "common_runtime/threadpool_device.h",
     "common_runtime/tracing_device.h",
-    "common_runtime/visitable_allocator.h",
     "common_runtime/process_state.h",
     "common_runtime/pool_allocator.h",
     "graph/gradients.h",
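The once-only registration in verbs_server_lib.cc relies on a namespace-scope std::once_flag (the type is std::once_flag; there is no std::once_call in the standard library, which is why the declaration is corrected above). A compilable sketch of the same idiom, with RegMemVisitors reduced to a stub:

#include <iostream>
#include <mutex>

namespace {
std::once_flag reg_mem_visitors_call;
}  // namespace

void RegMemVisitors() { std::cout << "visitors registered\n"; }

// Hypothetical stand-in for VerbsServer::Init(): the registration runs
// exactly once no matter how many times Init() is invoked.
void Init() {
  std::call_once(reg_mem_visitors_call, []() { RegMemVisitors(); });
  std::cout << "server initialized\n";
}

int main() {
  Init();
  Init();  // RegMemVisitors() is not called again.
}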
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 84c6285bbe..3843ea9e60 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -31,7 +31,7 @@ namespace tensorflow {
 
 BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                            bool allow_growth, const string& name)
-    : suballocator_(sub_allocator),
+    : sub_allocator_(sub_allocator),
       name_(name),
       free_chunks_list_(kInvalidChunkHandle),
       next_allocation_id_(1) {
@@ -72,7 +72,7 @@ BFCAllocator::~BFCAllocator() {
   VLOG(2) << "Number of regions allocated: "
          << region_manager_.regions().size();
   for (const auto& region : region_manager_.regions()) {
-    suballocator_->Free(region.ptr(), region.memory_size());
+    sub_allocator_->Free(region.ptr(), region.memory_size());
   }
 
   for (BinNum b = 0; b < kNumBins; b++) {
@@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
 
   // Try allocating.
   size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
-  void* mem_addr = suballocator_->Alloc(alignment, bytes);
+  void* mem_addr = sub_allocator_->Alloc(alignment, bytes);
   if (mem_addr == nullptr && !started_backpedal_) {
     // Only backpedal once.
     started_backpedal_ = true;
@@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
     while (mem_addr == nullptr) {
       bytes = RoundedBytes(bytes * kBackpedalFactor);
       if (bytes < rounded_bytes) break;
-      mem_addr = suballocator_->Alloc(alignment, bytes);
+      mem_addr = sub_allocator_->Alloc(alignment, bytes);
     }
   }
 
@@ -158,10 +158,6 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
   // Insert the chunk into the right bin.
   InsertFreeChunkIntoBin(h);
 
-  // Invoke visitors on newly allocated region.
-  for (const auto& visitor : region_visitors_) {
-    visitor(mem_addr, bytes);
-  }
   return true;
 }
 
@@ -490,15 +486,6 @@ void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) {
   InsertFreeChunkIntoBin(coalesced_chunk);
 }
 
-void BFCAllocator::AddAllocVisitor(Visitor visitor) {
-  VLOG(1) << "AddVisitor";
-  mutex_lock l(lock_);
-  region_visitors_.push_back(visitor);
-  for (const auto& region : region_manager_.regions()) {
-    visitor(region.ptr(), region.memory_size());
-  }
-}
-
 bool BFCAllocator::TracksAllocationSizes() { return true; }
 
 size_t BFCAllocator::RequestedSize(const void* ptr) {
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 20e1dab1d5..364071e066 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/macros.h"
@@ -42,7 +42,7 @@ namespace tensorflow {
 // coalescing. One assumption we make is that the process using this
 // allocator owns pretty much all of the memory, and that nearly
 // all requests to allocate memory go through this interface.
-class BFCAllocator : public VisitableAllocator {
+class BFCAllocator : public Allocator {
 public:
   // Takes ownership of sub_allocator.
   BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
@@ -55,11 +55,6 @@ class BFCAllocator : public VisitableAllocator {
                     const AllocationAttributes& allocation_attr) override;
   void DeallocateRaw(void* ptr) override;
 
-  void AddAllocVisitor(Visitor visitor) override;
-
-  // Does nothing, because memory is never freed.
-  void AddFreeVisitor(Visitor visitor) override {}
-
   bool TracksAllocationSizes() override;
 
   size_t RequestedSize(const void* ptr) override;
@@ -423,7 +418,7 @@ class BFCAllocator : public VisitableAllocator {
   // of the available memory.
   bool started_backpedal_ = false;
 
-  std::unique_ptr<SubAllocator> suballocator_;
+  std::unique_ptr<SubAllocator> sub_allocator_;
   string name_;
 
   // Structures mutable after construction
@@ -435,9 +430,6 @@ class BFCAllocator : public VisitableAllocator {
   // Pointer to head of linked list of free Chunks
   ChunkHandle free_chunks_list_ GUARDED_BY(lock_);
 
-  // Called once on each region, ASAP.
-  std::vector<Visitor> region_visitors_ GUARDED_BY(lock_);
-
   // Counter containing the next unique identifier to assign to a
   // newly-created chunk.
   int64 next_allocation_id_ GUARDED_BY(lock_);
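With AddAllocVisitor/AddFreeVisitor gone from BFCAllocator, visitor bookkeeping lives in the SubAllocator base class that sub_allocator_ now points at. The actual TensorFlow definition is not shown in this diff; a plausible minimal sketch of such a base class, using only the names visible in the patch (the VisitAlloc/VisitFree bodies are assumptions):

#include <cstddef>
#include <functional>
#include <vector>

class SubAllocator {
 public:
  using Visitor = std::function<void(void*, int index, size_t)>;

  SubAllocator(const std::vector<Visitor>& alloc_visitors,
               const std::vector<Visitor>& free_visitors)
      : alloc_visitors_(alloc_visitors), free_visitors_(free_visitors) {}
  virtual ~SubAllocator() {}

  virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
  virtual void Free(void* ptr, size_t num_bytes) = 0;

 protected:
  // Subclasses call these from Alloc()/Free(); 'index' is the NUMA node
  // for CPU suballocators and the GPU id for GPU suballocators.
  void VisitAlloc(void* ptr, int index, size_t num_bytes) {
    for (const auto& v : alloc_visitors_) v(ptr, index, num_bytes);
  }
  void VisitFree(void* ptr, int index, size_t num_bytes) {
    for (const auto& v : free_visitors_) v(ptr, index, num_bytes);
  }

 private:
  const std::vector<Visitor> alloc_visitors_;
  const std::vector<Visitor> free_visitors_;
};

Because the visitor lists are fixed at construction, there is no longer a race between allocator growth and late visitor registration, which is what the old AddAllocVisitor replay loop had to handle.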
diff --git a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h b/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
index 636cd43575..6bd29ef775 100644
--- a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
@@ -26,8 +26,12 @@ namespace tensorflow {
 class CUDAHostAllocator : public SubAllocator {
 public:
   // Note: stream_exec cannot be null.
-  explicit CUDAHostAllocator(se::StreamExecutor* stream_exec)
-      : stream_exec_(stream_exec) {
+  explicit CUDAHostAllocator(se::StreamExecutor* stream_exec, int numa_node,
+                             const std::vector<Visitor>& alloc_visitors,
+                             const std::vector<Visitor>& free_visitors)
+      : SubAllocator(alloc_visitors, free_visitors),
+        stream_exec_(stream_exec),
+        numa_node_(numa_node) {
     CHECK(stream_exec_ != nullptr);
   }
   ~CUDAHostAllocator() override {}
@@ -39,19 +43,23 @@ class CUDAHostAllocator : public SubAllocator {
       if (ptr == nullptr) {
         LOG(WARNING) << "could not allocate pinned host memory of size: "
                      << num_bytes;
+        return ptr;
       }
+      VisitAlloc(ptr, numa_node_, num_bytes);
     }
     return ptr;
   }
 
   void Free(void* ptr, size_t num_bytes) override {
     if (ptr != nullptr) {
+      VisitFree(ptr, numa_node_, num_bytes);
       stream_exec_->HostMemoryDeallocate(ptr);
     }
   }
 
 private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
+  const int numa_node_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
index 2d4c8d0201..44ffce77a1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
@@ -22,18 +22,15 @@ limitations under the License.
 
 namespace tensorflow {
 
-GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
-                                 const string& name)
-    : GPUBFCAllocator(cuda_gpu_id, total_memory, GPUOptions(), name) {}
+GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator,
+                                 size_t total_memory, const string& name)
+    : GPUBFCAllocator(sub_allocator, total_memory, GPUOptions(), name) {}
 
-GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator,
+                                 size_t total_memory,
                                  const GPUOptions& gpu_options,
                                  const string& name)
-    : BFCAllocator(
-          new GPUMemAllocator(
-              GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
-              gpu_options.per_process_gpu_memory_fraction() > 1.0 ||
-                  gpu_options.experimental().use_unified_memory()),
-          total_memory, gpu_options.allow_growth(), name) {}
+    : BFCAllocator(sub_allocator, total_memory, gpu_options.allow_growth(),
+                   name) {}
 
 }  // namespace tensorflow
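Note the ordering contract now visible in CUDAHostAllocator: VisitAlloc runs only after a successful allocation (the new early return skips it on failure), and VisitFree runs before the memory is handed back. A toy host suballocator built on the SubAllocator sketch shown earlier, with std::malloc/std::free standing in for StreamExecutor's pinned-memory calls; everything except the visit ordering is an assumption:

#include <cstdlib>

// Builds on the SubAllocator sketch above (same header assumptions).
class ToyHostAllocator : public SubAllocator {
 public:
  ToyHostAllocator(int numa_node,
                   const std::vector<Visitor>& alloc_visitors,
                   const std::vector<Visitor>& free_visitors)
      : SubAllocator(alloc_visitors, free_visitors), numa_node_(numa_node) {}

  void* Alloc(size_t alignment, size_t num_bytes) override {
    void* ptr = nullptr;
    if (num_bytes > 0) {
      ptr = std::malloc(num_bytes);    // stand-in for HostMemoryAllocate
      if (ptr == nullptr) return ptr;  // failure: no visitor runs
      VisitAlloc(ptr, numa_node_, num_bytes);
    }
    return ptr;
  }

  void Free(void* ptr, size_t num_bytes) override {
    if (ptr != nullptr) {
      VisitFree(ptr, numa_node_, num_bytes);  // before the memory goes away
      std::free(ptr);
    }
  }

 private:
  const int numa_node_;
};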
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index f1cc2eace1..6b6de80734 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -31,28 +31,20 @@ limitations under the License.
 
 namespace tensorflow {
 
-// A GPU memory allocator that implements a 'best-fit with coalescing'
-// algorithm.
-class GPUBFCAllocator : public BFCAllocator {
- public:
-  // 'cuda_gpu_id' refers to the ID of the GPU device within
-  // the process and must reference a valid ID in the process.
-  GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
-                  const string& name);
-  GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
-                  const GPUOptions& gpu_options, const string& name);
-  virtual ~GPUBFCAllocator() {}
-
-  TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
-};
-
 // Suballocator for GPU memory.
 class GPUMemAllocator : public SubAllocator {
 public:
+  // 'cuda_gpu_id' refers to the ID of the GPU device within
+  // the process and must reference a valid ID in the process.
   // Note: stream_exec cannot be null.
-  explicit GPUMemAllocator(se::StreamExecutor* stream_exec,
-                           bool use_unified_memory)
-      : stream_exec_(stream_exec), use_unified_memory_(use_unified_memory) {
+  explicit GPUMemAllocator(se::StreamExecutor* stream_exec, CudaGpuId gpu_id,
+                           bool use_unified_memory,
+                           const std::vector<Visitor>& alloc_visitors,
+                           const std::vector<Visitor>& free_visitors)
+      : SubAllocator(alloc_visitors, free_visitors),
+        stream_exec_(stream_exec),
+        gpu_id_(gpu_id),
+        use_unified_memory_(use_unified_memory) {
     CHECK(stream_exec_ != nullptr);
   }
   ~GPUMemAllocator() override {}
@@ -65,12 +57,14 @@ class GPUMemAllocator : public SubAllocator {
     } else {
       ptr = stream_exec_->AllocateArray<char>(num_bytes).opaque();
     }
+    VisitAlloc(ptr, gpu_id_.value(), num_bytes);
   }
   return ptr;
 }
 
 void Free(void* ptr, size_t num_bytes) override {
   if (ptr != nullptr) {
+    VisitFree(ptr, gpu_id_.value(), num_bytes);
     if (use_unified_memory_) {
       stream_exec_->UnifiedMemoryDeallocate(ptr);
     } else {
@@ -82,11 +76,25 @@ class GPUMemAllocator : public SubAllocator {
 private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
+  const CudaGpuId gpu_id_;
   const bool use_unified_memory_ = false;
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator);
 };
 
+// A GPU memory allocator that implements a 'best-fit with coalescing'
+// algorithm.
+class GPUBFCAllocator : public BFCAllocator {
+ public:
+  GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory,
+                  const string& name);
+  GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory,
+                  const GPUOptions& gpu_options, const string& name);
+  ~GPUBFCAllocator() override {}
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 67caeb3495..7112c3afd4 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -46,7 +47,11 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
 }
 
 TEST(GPUBFCAllocatorTest, NoDups) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   // Allocate a lot of raw pointers
@@ -75,7 +80,11 @@ TEST(GPUBFCAllocatorTest, NoDups) {
 }
 
 TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   // Allocate 256 raw pointers of sizes between 100 bytes and about
   // a meg
   random::PhiloxRandom philox(123, 17);
@@ -133,7 +142,11 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
 }
 
 TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   float* first_ptr = a.Allocate<float>(1024);
@@ -168,18 +181,30 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
 }
 
 TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   float* ptr = a.Allocate<float>(0);
   EXPECT_EQ(nullptr, ptr);
 }
 
 TEST(GPUBFCAllocatorTest, TracksSizes) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
 TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   float* t1 = a.Allocate<float>(1);
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
@@ -187,8 +212,12 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
 }
 
 TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
   // Configure a 1MiB limit
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc");
+  GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc");
 
   float* first_ptr = a.Allocate<float>(1 << 6);
   float* second_ptr = a.Allocate<float>(1 << 20);
@@ -203,7 +232,11 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
   options.set_allow_growth(true);
 
   // Max of 2GiB, but starts out small.
-  GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1LL << 31, options, "GPU_0_bfc");
 
   // Allocate 10 raw pointers of sizes between 100 bytes and about
   // 64 megs.
@@ -264,8 +297,15 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
 }
 
 TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
-  GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
-  GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1UL << 60, "GPU_0_bfc");
+  sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator b(sub_allocator, 1UL << 60, "GPU_0_bfc");
   void* amem = a.AllocateRaw(1, 1);
   void* bmem = b.AllocateRaw(1, 1 << 30);
   a.DeallocateRaw(amem);
@@ -273,7 +313,11 @@ TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
 }
 
 static void BM_Allocation(int iters) {
-  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<size_t> sizes = {256,  4096,    16384,    524288,
                                512,  1048576, 10485760, 104857600,
@@ -289,7 +333,11 @@ static void BM_Allocation(int iters) {
 BENCHMARK(BM_Allocation);
 
 static void BM_AllocationThreaded(int iters, int num_threads) {
-  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
   thread::ThreadPool pool(Env::Default(), "test", num_threads);
   std::atomic_int_fast32_t count(iters);
   mutex done_lock;
@@ -325,7 +373,11 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);
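Every test and benchmark in this file now repeats the same four-line GPUMemAllocator setup. A hypothetical helper, not part of this change, could shrink that boilerplate if the file is touched again; it reuses only types already present in the test:

// Hypothetical convenience helper for the tests above; not in the diff.
static GPUBFCAllocator* CreateGPUBFCAllocator(size_t total_memory,
                                              const string& name) {
  CudaGpuId cuda_gpu_id(0);
  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
      false /*use_unified_memory*/, {}, {});
  return new GPUBFCAllocator(sub_allocator, total_memory, name);
}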
 // A more complex benchmark that defers deallocation of an object for
 // "delay" allocations.
 static void BM_AllocationDelayed(int iters, int delay) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
   int size_index = 0;
@@ -363,7 +415,11 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   // only methods inside this class can access private members of BFCAllocator.
 
   void TestBinDebugInfo() {
-    GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+    CudaGpuId cuda_gpu_id(0);
+    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+        false /*use_unified_memory*/, {}, {});
+    GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
 
     std::vector<void*> initial_ptrs;
     std::vector<size_t> initial_ptrs_allocated_sizes;
@@ -441,7 +497,11 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   }
 
   void TestLog2FloorNonZeroSlow() {
-    GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc");
+    CudaGpuId cuda_gpu_id(0);
+    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+        false /*use_unified_memory*/, {}, {});
+    GPUBFCAllocator a(sub_allocator, 1 /* total_memory */, "GPU_0_bfc");
     EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
     EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
     EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 934a57a5fb..8e14f1ea75 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -27,7 +27,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator,
+GPUcudaMallocAllocator::GPUcudaMallocAllocator(Allocator* allocator,
                                                CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
   stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -60,14 +60,6 @@ void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
 #endif  // GOOGLE_CUDA
 }
 
-void GPUcudaMallocAllocator::AddAllocVisitor(Visitor visitor) {
-  return base_allocator_->AddAllocVisitor(visitor);
-}
-
-void GPUcudaMallocAllocator::AddFreeVisitor(Visitor visitor) {
-  return base_allocator_->AddFreeVisitor(visitor);
-}
-
 bool GPUcudaMallocAllocator::TracksAllocationSizes() { return false; }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 856fdc34b4..3d1d0ef481 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
@@ -29,20 +29,17 @@ namespace tensorflow {
 // An allocator that wraps a GPU allocator and adds debugging
 // functionality that verifies that users do not write outside their
 // allocated memory.
-class GPUcudaMallocAllocator : public VisitableAllocator {
+class GPUcudaMallocAllocator : public Allocator {
 public:
-  explicit GPUcudaMallocAllocator(VisitableAllocator* allocator,
-                                  CudaGpuId cuda_gpu_id);
+  explicit GPUcudaMallocAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
   ~GPUcudaMallocAllocator() override;
   string Name() override { return "gpu_debug"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
-  void AddAllocVisitor(Visitor visitor) override;
-  void AddFreeVisitor(Visitor visitor) override;
   bool TracksAllocationSizes() override;
 
 private:
-  VisitableAllocator* base_allocator_ = nullptr;  // owned
+  Allocator* base_allocator_ = nullptr;  // owned
 
   se::StreamExecutor* stream_exec_;  // Not owned.
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index e4c834b30d..6bad66dcec 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -73,7 +73,7 @@ void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
 // -----------------------------------------------------------------------------
 // GPUDebugAllocator
 // -----------------------------------------------------------------------------
-GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator,
+GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
                                      CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
   stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -111,14 +111,6 @@ void GPUDebugAllocator::DeallocateRaw(void* ptr) {
   base_allocator_->DeallocateRaw(ptr);
 }
 
-void GPUDebugAllocator::AddAllocVisitor(Visitor visitor) {
-  return base_allocator_->AddAllocVisitor(visitor);
-}
-
-void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) {
-  return base_allocator_->AddFreeVisitor(visitor);
-}
-
 bool GPUDebugAllocator::TracksAllocationSizes() { return true; }
 
 size_t GPUDebugAllocator::RequestedSize(const void* ptr) {
@@ -158,7 +150,7 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) {
 // -----------------------------------------------------------------------------
 // GPUNanResetAllocator
 // -----------------------------------------------------------------------------
-GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator,
+GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
                                            CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
   stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -200,14 +192,6 @@ void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
   base_allocator_->DeallocateRaw(ptr);
 }
 
-void GPUNanResetAllocator::AddAllocVisitor(Visitor visitor) {
-  return base_allocator_->AddAllocVisitor(visitor);
-}
-
-void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) {
-  return base_allocator_->AddFreeVisitor(visitor);
-}
-
 size_t GPUNanResetAllocator::RequestedSize(const void* ptr) {
   return base_allocator_->RequestedSize(ptr);
 }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 0f9b72040c..0f27ff4384 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
@@ -31,16 +31,13 @@ namespace tensorflow {
 // An allocator that wraps a GPU allocator and adds debugging
 // functionality that verifies that users do not write outside their
 // allocated memory.
-class GPUDebugAllocator : public VisitableAllocator {
+class GPUDebugAllocator : public Allocator {
 public:
-  explicit GPUDebugAllocator(VisitableAllocator* allocator,
-                             CudaGpuId cuda_gpu_id);
+  explicit GPUDebugAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
   ~GPUDebugAllocator() override;
   string Name() override { return "gpu_debug"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
-  void AddAllocVisitor(Visitor visitor) override;
-  void AddFreeVisitor(Visitor visitor) override;
   bool TracksAllocationSizes() override;
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
@@ -53,7 +50,7 @@ class GPUDebugAllocator : public VisitableAllocator {
   bool CheckFooter(void* ptr);
 
 private:
-  VisitableAllocator* base_allocator_ = nullptr;  // owned
+  Allocator* base_allocator_ = nullptr;  // owned
 
   se::StreamExecutor* stream_exec_;  // Not owned.
 
@@ -63,23 +60,20 @@ class GPUDebugAllocator : public VisitableAllocator {
 // An allocator that wraps a GPU allocator and resets the memory on
 // allocation and free to 'NaN', helping to identify cases where the
 // user forgets to initialize the memory.
-class GPUNanResetAllocator : public VisitableAllocator {
+class GPUNanResetAllocator : public Allocator {
 public:
-  explicit GPUNanResetAllocator(VisitableAllocator* allocator,
-                                CudaGpuId cuda_gpu_id);
+  explicit GPUNanResetAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
   ~GPUNanResetAllocator() override;
   string Name() override { return "gpu_nan_reset"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
-  void AddAllocVisitor(Visitor visitor) override;
-  void AddFreeVisitor(Visitor visitor) override;
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
   void GetStats(AllocatorStats* stats) override;
   void ClearStats() override;
 
 private:
-  VisitableAllocator* base_allocator_ = nullptr;  // owned
+  Allocator* base_allocator_ = nullptr;  // owned
 
   se::StreamExecutor* stream_exec_;  // Not owned.
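GPUDebugAllocator and GPUNanResetAllocator are now plain Allocator decorators: they own a base allocator, forward the core calls, and no longer forward visitor registration (visitors attach to the SubAllocator instead). A generic sketch of that decorator shape, using a minimal stand-in interface rather than TensorFlow's real Allocator:

#include <cstddef>
#include <string>

// Minimal stand-in in the spirit of tensorflow::Allocator.
class Allocator {
 public:
  virtual ~Allocator() {}
  virtual std::string Name() = 0;
  virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0;
  virtual void DeallocateRaw(void* ptr) = 0;
};

// A pass-through wrapper: after this change the debug allocators decorate
// a plain Allocator; there is no visitor plumbing left to forward.
class WrappingAllocator : public Allocator {
 public:
  explicit WrappingAllocator(Allocator* base) : base_(base) {}
  ~WrappingAllocator() override { delete base_; }
  std::string Name() override { return "wrapper(" + base_->Name() + ")"; }
  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    // Pre/post hooks (guard bytes, NaN fill, ...) would go here.
    return base_->AllocateRaw(alignment, num_bytes);
  }
  void DeallocateRaw(void* ptr) override { base_->DeallocateRaw(ptr); }

 private:
  Allocator* base_;  // owned, as in GPUDebugAllocator
};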
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
index 236a0afa0b..98283cd846 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -35,7 +35,10 @@ namespace {
 
 TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
   const CudaGpuId cuda_gpu_id(0);
-  GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                       cuda_gpu_id);
   auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
@@ -59,7 +62,10 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
   EXPECT_DEATH(
       {
         const CudaGpuId cuda_gpu_id(0);
-        GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+        GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+            GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
+            cuda_gpu_id, false /*use_unified_memory*/, {}, {});
+        GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                             cuda_gpu_id);
         auto stream_exec =
             GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -92,7 +98,10 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
   EXPECT_DEATH(
       {
         const CudaGpuId cuda_gpu_id(0);
-        GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+        GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+            GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
+            cuda_gpu_id, false /*use_unified_memory*/, {}, {});
+        GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                             cuda_gpu_id);
         auto stream_exec =
             GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -122,7 +131,10 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
 
 TEST(GPUDebugAllocatorTest, ResetToNan) {
   const CudaGpuId cuda_gpu_id(0);
-  GPUNanResetAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUNanResetAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                          cuda_gpu_id);
   auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
@@ -163,8 +175,11 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
 TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
   const CudaGpuId cuda_gpu_id(0);
   // NaN reset must be the outer-most allocator.
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
   GPUNanResetAllocator a(
-      new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+      new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                             cuda_gpu_id),
       cuda_gpu_id);
   auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -205,15 +220,21 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
 
 TEST(GPUDebugAllocatorTest, TracksSizes) {
   const CudaGpuId cuda_gpu_id(0);
-  GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                       cuda_gpu_id);
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
 TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
   const CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
   GPUNanResetAllocator a(
-      new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+      new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
                             cuda_gpu_id),
       cuda_gpu_id);
   float* t1 = a.Allocate<float>(1);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 2763ac0d4a..50e61b7e00 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -41,7 +41,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/common_runtime/local_device.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -285,6 +284,38 @@ BaseGPUDevice::~BaseGPUDevice() {
   for (auto ctx : device_contexts_) ctx->Unref();
 }
 
+// This should be idempotent if already initialized.
+Status BaseGPUDevice::InitScratchBuffers() {
+  mutex_lock l(scratch_init_mutex_);
+  if (scratch_.size() < max_streams_) {
+    for (int i = 0; i < max_streams_; i++) {
+      DCHECK(streams_[i]);
+      if (scratch_.size() > i && scratch_[i]) continue;
+      size_t scratch_buffer_size =
+          Eigen::kCudaScratchSize + sizeof(unsigned int);
+      void* scratch_buffer = gpu_allocator_->AllocateRaw(
+          Allocator::kAllocatorAlignment, scratch_buffer_size);
+      if (scratch_buffer == nullptr) {
+        return errors::FailedPrecondition(
+            "Failed to allocate scratch buffer for device ",
+            tf_gpu_id_.value());
+      }
+      se::DeviceMemory<char> mem(
+          se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
+
+      bool ok = executor_->SynchronousMemZero(
+          &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
+      if (!ok) {
+        return errors::FailedPrecondition(
+            "Failed to memcopy into scratch buffer for device ",
+            tf_gpu_id_.value());
+      }
+      scratch_.push_back(static_cast<char*>(scratch_buffer));
+    }
+  }
+  return Status::OK();
+}
+
 Status BaseGPUDevice::Init(const SessionOptions& options) {
   auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
   if (!executor_status.status().ok()) {
@@ -303,27 +334,6 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
   for (int i = 0; i < max_streams_; i++) {
     streams_.push_back(StreamGroupFactory::Global().GetOrCreate(
         tf_gpu_id_, i, executor_, options.config.gpu_options()));
-
-    size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
-    void* scratch_buffer = gpu_allocator_->AllocateRaw(
-        Allocator::kAllocatorAlignment, scratch_buffer_size);
-    if (scratch_buffer == nullptr) {
-      return errors::FailedPrecondition(
-          "Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
-    }
-    scratch_.push_back(static_cast<char*>(scratch_buffer));
-
-    se::DeviceMemory<char> mem(
-        se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
-
-    bool ok = executor_->SynchronousMemZero(
-        &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
-    if (!ok) {
-      return errors::FailedPrecondition(
-          "Failed to memcopy into scratch buffer for device ",
-          tf_gpu_id_.value());
-    }
-
     device_contexts_.push_back(new GPUDeviceContext(
         i, streams_.back()->compute, streams_.back()->host_to_device,
         streams_.back()->device_to_host, streams_.back()->device_to_device));
@@ -867,10 +877,11 @@ PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
   return new ConcretePerOpGpuDevice();
 }
 
-void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
-                                          PerOpGpuDevice* device,
-                                          DeviceContext* dc,
-                                          Allocator* allocator) {
+Status BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
+                                            PerOpGpuDevice* device,
+                                            DeviceContext* dc,
+                                            Allocator* allocator) {
+  TF_RETURN_IF_ERROR(InitScratchBuffers());
   if (dc) {
     const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
     const int stream_id = gpu_dc->stream_id();
@@ -881,6 +892,7 @@ void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
   } else {
     ReinitializeDevice(context, device, 0, allocator);
   }
+  return Status::OK();
 }
 
 Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr,
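InitScratchBuffers() above moves the Eigen scratch-buffer allocation out of Init() and makes it lazy: it takes scratch_init_mutex_, skips streams that already have a buffer, and surfaces failures as a Status. A stripped-down sketch of that idempotent lazy-init shape (plain C++, no CUDA; calloc stands in for gpu_allocator_, and like scratch_ in the diff the buffers are never freed):

#include <cstdlib>
#include <mutex>
#include <vector>

class Device {
 public:
  // Idempotent: safe to call before every op launch, as
  // ReinitializeGpuDevice() now does.
  bool InitScratchBuffers(int max_streams) {
    std::lock_guard<std::mutex> l(scratch_init_mutex_);
    while (static_cast<int>(scratch_.size()) < max_streams) {
      char* buf = static_cast<char*>(std::calloc(1, 4096));
      if (buf == nullptr) return false;  // surfaced as a Status in the diff
      scratch_.push_back(buf);
    }
    return true;
  }

 private:
  std::mutex scratch_init_mutex_;
  std::vector<char*> scratch_;  // one zeroed buffer per stream, never freed
};

Deferring the allocation means a process that creates GPU devices but never runs a kernel no longer pays for the scratch memory, at the cost of a mutex acquisition on the first launch.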
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 56d03d7a8c..b3eea55758 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -86,8 +86,9 @@ class BaseGPUDevice : public LocalDevice {
   // The caller owns the returned device.
   PerOpGpuDevice* MakeGpuDevice() override;
 
-  void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
-                             DeviceContext* dc, Allocator* allocator) override;
+  Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
+                               DeviceContext* dc,
+                               Allocator* allocator) override;
 
   // Returns the CUDA GPU id of this device within the native driver system;
   // e.g., for CUDA this is the ordinal of the GPU within the system.
@@ -125,6 +126,7 @@ class BaseGPUDevice : public LocalDevice {
   class StreamGroupFactory;
 
   gtl::InlinedVector<StreamGroup*, 4> streams_;
+  mutex scratch_init_mutex_;
   gtl::InlinedVector<char*, 4> scratch_;
   std::vector<GPUDeviceContext*> device_contexts_;
   GpuDeviceInfo* gpu_device_info_ = nullptr;
@@ -135,6 +137,9 @@ class BaseGPUDevice : public LocalDevice {
   std::unique_ptr<EventMgr> em_;
   std::unique_ptr<thread::ThreadPool> thread_pool_;
 
+  // Initialize scratch buffers used by Eigen.
+  Status InitScratchBuffers();
+
   void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
                           int stream_id, Allocator* allocator);
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index b18688174d..9ec740fabe 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -76,12 +76,16 @@ GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
 // This function is defined for debugging problems with the allocators.
 GPUProcessState::~GPUProcessState() {
   CHECK_EQ(this, instance_);
-  for (auto p : gpu_allocators_) {
-    delete p;
-  }
   instance_ = nullptr;
 }
 
+int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
+  // Return the NUMA node associated with the GPU's StreamExecutor.
+  se::StreamExecutor* se =
+      GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
+  return se->GetDeviceDescription().numa_node();
+}
+
 Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
                                             TfGpuId tf_gpu_id,
                                             size_t total_bytes) {
@@ -93,13 +97,10 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
 
   if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
     gpu_allocators_.resize(tf_gpu_id.value() + 1);
-    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
-      gpu_al_.resize(tf_gpu_id.value() + 1);
   }
 
-  if (gpu_allocators_[tf_gpu_id.value()] == nullptr) {
-    VisitableAllocator* gpu_allocator;
-
+  AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
+  if (allocator_parts.allocator.get() == nullptr) {
     // Validate allocator types.
     if (!allocator_type.empty() && allocator_type != "BFC") {
       LOG(ERROR) << "Invalid allocator type: " << allocator_type;
       return nullptr;
     }
     CudaGpuId cuda_gpu_id;
     TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
-    gpu_allocator =
-        new GPUBFCAllocator(cuda_gpu_id, total_bytes, options,
+    int bus_id = BusIdForGPU(tf_gpu_id);
+    while (bus_id >= gpu_visitors_.size()) {
+      gpu_visitors_.push_back({});
+    }
+    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+        (options.per_process_gpu_memory_fraction() > 1.0 ||
+         options.experimental().use_unified_memory()),
+        gpu_visitors_[bus_id], {});
+    Allocator* gpu_allocator =
+        new GPUBFCAllocator(sub_allocator, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
 
     // If true, checks for memory overwrites by writing
@@ -123,34 +133,25 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
       // **WARNING** probably will not work in a multi-gpu scenario
       gpu_allocator = new GPUcudaMallocAllocator(gpu_allocator, cuda_gpu_id);
     }
-    gpu_allocators_[tf_gpu_id.value()] = gpu_allocator;
-
-    // If there are any pending AllocVisitors for this bus, add
-    // them now.
-    se::StreamExecutor* se =
-        GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
-    int bus_id = se->GetDeviceDescription().numa_node();
-    if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
-      for (const auto& v : gpu_visitors_[bus_id]) {
-        gpu_allocator->AddAllocVisitor(v);
-      }
-    }
+
+    Allocator* recording_allocator = nullptr;
     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
       ProcessState::MemDesc md;
       md.loc = ProcessState::MemDesc::GPU;
       md.dev_index = cuda_gpu_id.value();
       md.gpu_registered = false;
       md.nic_registered = true;
-      if (static_cast<int64>(gpu_al_.size()) <= tf_gpu_id.value()) {
-        gpu_al_.resize(tf_gpu_id.value() + 1);
-      }
-      gpu_al_[tf_gpu_id.value()] = new internal::RecordingAllocator(
+      recording_allocator = new internal::RecordingAllocator(
           &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
     }
+    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator), sub_allocator,
+                       std::unique_ptr<Allocator>(recording_allocator)};
+  }
+  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
+    return allocator_parts.recording_allocator.get();
+  } else {
+    return allocator_parts.allocator.get();
   }
-  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
-    return gpu_al_[tf_gpu_id.value()];
-  return gpu_allocators_[tf_gpu_id.value()];
 #else
   LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda.";
   return nullptr;
@@ -172,11 +173,12 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
     tf_shared_lock lock(mu_);
 
     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
-        static_cast<int>(cuda_al_.size()) > 0) {
-      return cuda_al_[0];
+        !cuda_host_allocators_.empty() &&
+        cuda_host_allocators_[0].recording_allocator != nullptr) {
+      return cuda_host_allocators_[0].recording_allocator.get();
     }
     if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
-      return cuda_host_allocators_[0];
+      return cuda_host_allocators_[0].allocator.get();
     }
   }
 
@@ -190,7 +192,7 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
   // it knows is valid.
se::StreamExecutor* se = nullptr; for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) { - if (gpu_allocators_[i] != nullptr) { + if (gpu_allocators_[i].allocator != nullptr) { se = GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie(); break; } @@ -199,6 +201,15 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) { CHECK_NE(nullptr, se); while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) { + while (cuda_host_alloc_visitors_.size() <= numa_node) { + cuda_host_alloc_visitors_.push_back({}); + } + while (cuda_host_free_visitors_.size() <= numa_node) { + cuda_host_free_visitors_.push_back({}); + } + SubAllocator* sub_allocator = new CUDAHostAllocator( + se, numa_node, cuda_host_alloc_visitors_[numa_node], + cuda_host_free_visitors_[numa_node]); // TODO(zheng-xq): evaluate whether 64GB by default is the best choice. int64 cuda_host_mem_limit_in_mb = -1; Status status = ReadInt64FromEnvVar("TF_CUDA_HOST_MEM_LIMIT_IN_MB", @@ -208,62 +219,92 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) { LOG(ERROR) << "GetCUDAHostAllocator: " << status.error_message(); } int64 cuda_host_mem_limit = cuda_host_mem_limit_in_mb * (1LL << 20); - VisitableAllocator* allocator = - new BFCAllocator(new CUDAHostAllocator(se), cuda_host_mem_limit, + Allocator* allocator = + new BFCAllocator(sub_allocator, cuda_host_mem_limit, true /*allow_growth*/, "cuda_host_bfc" /*name*/); - if (LogMemory::IsEnabled()) { + if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) { // Wrap the allocator to track allocation ids for better logging // at the cost of performance. - allocator = new TrackingVisitableAllocator(allocator, true); + allocator = new TrackingAllocator(allocator, true); } - cuda_host_allocators_.push_back(allocator); + cuda_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator), + sub_allocator, + std::unique_ptr<Allocator>(nullptr)}); + AllocatorParts& allocator_parts = cuda_host_allocators_.back(); if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) { ProcessState::MemDesc md; md.loc = ProcessState::MemDesc::CPU; md.dev_index = 0; md.gpu_registered = true; md.nic_registered = false; - cuda_al_.push_back(new internal::RecordingAllocator( - &process_state_->mem_desc_map_, cuda_host_allocators_.back(), md, - &mu_)); + allocator_parts.recording_allocator.reset( + new internal::RecordingAllocator(&process_state_->mem_desc_map_, + allocator_parts.allocator.get(), md, + &mu_)); } } - if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) - return cuda_al_[0]; - return cuda_host_allocators_[0]; + if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) { + return cuda_host_allocators_[0].recording_allocator.get(); + } else { + return cuda_host_allocators_[0].allocator.get(); + } } void GPUProcessState::AddGPUAllocVisitor(int bus_id, - const AllocVisitor& visitor) { - CHECK(process_state_); + const SubAllocator::Visitor& visitor) { #if GOOGLE_CUDA mutex_lock lock(mu_); - for (int i = 0; i < static_cast<int64>(gpu_allocators_.size()); ++i) { - se::StreamExecutor* se = - GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie(); - if (gpu_allocators_[i] && - (se->GetDeviceDescription().numa_node() + 1) == bus_id) { - gpu_allocators_[i]->AddAllocVisitor(visitor); - } - } + CHECK(gpu_allocators_.empty()) // Crash OK + << "AddGPUAllocVisitor must be called before " + "first call to GetGPUAllocator."; while (bus_id >= static_cast<int64>(gpu_visitors_.size())) { - 
gpu_visitors_.push_back(std::vector<AllocVisitor>()); + gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>()); } gpu_visitors_[bus_id].push_back(visitor); #endif // GOOGLE_CUDA } +void GPUProcessState::AddCUDAHostAllocVisitor( + int numa_node, const SubAllocator::Visitor& visitor) { +#if GOOGLE_CUDA + mutex_lock lock(mu_); + CHECK(cuda_host_allocators_.empty()) // Crash OK + << "AddCUDAHostAllocVisitor must be called before " + "first call to GetCUDAHostAllocator."; + while (numa_node >= static_cast<int64>(cuda_host_alloc_visitors_.size())) { + cuda_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>()); + } + cuda_host_alloc_visitors_[numa_node].push_back(visitor); +#endif // GOOGLE_CUDA +} + +void GPUProcessState::AddCUDAHostFreeVisitor( + int numa_node, const SubAllocator::Visitor& visitor) { +#if GOOGLE_CUDA + mutex_lock lock(mu_); + CHECK(cuda_host_allocators_.empty()) // Crash OK + << "AddCUDAHostFreeVisitor must be called before " + "first call to GetCUDAHostAllocator."; + while (numa_node >= static_cast<int64>(cuda_host_free_visitors_.size())) { + cuda_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>()); + } + cuda_host_free_visitors_[numa_node].push_back(visitor); +#endif // GOOGLE_CUDA +} + void GPUProcessState::TestOnlyReset() { - process_state_->ProcessState::TestOnlyReset(); + if (process_state_) { + process_state_->ProcessState::TestOnlyReset(); + } { mutex_lock lock(mu_); gpu_device_enabled_ = false; + gpu_allocators_.clear(); gpu_visitors_.clear(); - gtl::STLDeleteElements(&gpu_allocators_); - gtl::STLDeleteElements(&cuda_host_allocators_); - gtl::STLDeleteElements(&gpu_al_); - gtl::STLDeleteElements(&cuda_al_); + cuda_host_allocators_.clear(); + cuda_host_alloc_visitors_.clear(); + cuda_host_free_visitors_.clear(); } } diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h index cb41c3c6bd..43e9a31660 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h +++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h @@ -32,7 +32,6 @@ limitations under the License. namespace tensorflow { class Allocator; -class VisitableAllocator; class PoolAllocator; // Singleton that manages per-process state when GPUs are present. @@ -72,18 +71,30 @@ class GPUProcessState { virtual Allocator* GetCUDAHostAllocator(int numa_node); - // Registers a function to be called once on every new Region - // allocated by every GPURegionAllocator proximate to the specified - // bus. The AllocVisitor is provided with a memory pointer and the - // size of the area it identifies. The pointer is not guaranteed to - // be valid after the call terminates. The intention is for this - // interface to be used for network device memory registration. - // "bus_id" is platform-specific. On many platforms it - // should be 0. On machines with multiple PCIe buses, it should be - // the index of one of the PCIe buses. If the bus_id is invalid, - // results are undefined. - typedef std::function<void(void*, size_t)> AllocVisitor; - virtual void AddGPUAllocVisitor(int bus_id, const AllocVisitor& visitor); + // Registers a Visitor to be invoked on new chunks of memory allocated by the + // SubAllocator of every GPU proximate to the specified bus. The AllocVisitor + // is provided with a memory pointer, a GPU id, and the size of the area it + // identifies. The pointer is not guaranteed to be valid after the call + // terminates. 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
index cb41c3c6bd..43e9a31660 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
@@ -32,7 +32,6 @@ limitations under the License.
 namespace tensorflow {
 class Allocator;
-class VisitableAllocator;
 class PoolAllocator;
 // Singleton that manages per-process state when GPUs are present.
@@ -72,18 +71,30 @@ class GPUProcessState {
   virtual Allocator* GetCUDAHostAllocator(int numa_node);
-  // Registers a function to be called once on every new Region
-  // allocated by every GPURegionAllocator proximate to the specified
-  // bus. The AllocVisitor is provided with a memory pointer and the
-  // size of the area it identifies. The pointer is not guaranteed to
-  // be valid after the call terminates. The intention is for this
-  // interface to be used for network device memory registration.
-  // "bus_id" is platform-specific. On many platforms it
-  // should be 0. On machines with multiple PCIe buses, it should be
-  // the index of one of the PCIe buses. If the bus_id is invalid,
-  // results are undefined.
-  typedef std::function<void(void*, size_t)> AllocVisitor;
-  virtual void AddGPUAllocVisitor(int bus_id, const AllocVisitor& visitor);
+  // Registers a Visitor to be invoked on new chunks of memory allocated by the
+  // SubAllocator of every GPU proximate to the specified bus. The AllocVisitor
+  // is provided with a memory pointer, a GPU id, and the size of the area it
+  // identifies. The pointer is not guaranteed to be valid after the call
+  // terminates. The intention is for this interface to be used for network
+  // device memory registration. "bus_id" is platform-specific. On many
+  // platforms it should be 0. On machines with multiple PCIe buses, it should
+  // be the index of one of the PCIe buses (maybe the NUMA node at which the
+  // PCIe is rooted). If the bus_id is invalid, results are undefined.
+  virtual void AddGPUAllocVisitor(int bus_id,
+                                  const SubAllocator::Visitor& visitor);
+
+  // Registers a Visitor to be invoked on new chunks of memory allocated by
+  // the SubAllocator of the CUDAHostAllocator for the given numa_node.
+  virtual void AddCUDAHostAllocVisitor(int numa_node,
+                                       const SubAllocator::Visitor& visitor);
+
+  // Registers a Visitor to be invoked on each chunk handed back for freeing to
+  // the SubAllocator of the CUDAHostAllocator for the given numa_node.
+  virtual void AddCUDAHostFreeVisitor(int numa_node,
+                                      const SubAllocator::Visitor& visitor);
+
+  // Returns bus_id for the given GPU id.
+  virtual int BusIdForGPU(TfGpuId tf_gpu_id);
  protected:
   GPUProcessState();
@@ -103,16 +114,21 @@ class GPUProcessState {
   mutex mu_;
-  std::vector<VisitableAllocator*> gpu_allocators_ GUARDED_BY(mu_);
-  std::vector<std::vector<AllocVisitor>> gpu_visitors_ GUARDED_BY(mu_);
-  std::vector<Allocator*> cuda_host_allocators_ GUARDED_BY(mu_);
+  struct AllocatorParts {
+    std::unique_ptr<Allocator> allocator;
+    SubAllocator* sub_allocator;  // owned by allocator
+    std::unique_ptr<Allocator> recording_allocator;
+  };
+  std::vector<AllocatorParts> gpu_allocators_ GUARDED_BY(mu_);
+  std::vector<std::vector<SubAllocator::Visitor>> gpu_visitors_ GUARDED_BY(mu_);
-  virtual ~GPUProcessState();
+  std::vector<AllocatorParts> cuda_host_allocators_ GUARDED_BY(mu_);
+  std::vector<std::vector<SubAllocator::Visitor>> cuda_host_alloc_visitors_
+      GUARDED_BY(mu_);
+  std::vector<std::vector<SubAllocator::Visitor>> cuda_host_free_visitors_
+      GUARDED_BY(mu_);
-  // Optional RecordingAllocators that wrap the corresponding
-  // Allocators for runtime attribute use analysis.
-  std::vector<Allocator*> gpu_al_ GUARDED_BY(mu_);
-  std::vector<Allocator*> cuda_al_ GUARDED_BY(mu_);
+  virtual ~GPUProcessState();
   friend class GPUDeviceTest;
 };
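The AllocatorParts bundle encodes an ownership convention worth spelling out: the wrapping Allocator owns the SubAllocator (per the "owned by allocator" comment), so only the Allocator itself is held by unique_ptr. A sketch of the same convention in isolation; the size and name are illustrative:

// Sketch, not part of the patch: the BFC allocator takes ownership of the
// SubAllocator it is built from, mirroring AllocatorParts above.
#include <memory>
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"

namespace tensorflow {
std::unique_ptr<Allocator> MakeExampleBfcAllocator() {
  SubAllocator* sub = new BasicCPUAllocator(0 /*numa_node*/, {}, {});
  // 'sub' must not be deleted separately; it lives and dies with the BFC.
  return std::unique_ptr<Allocator>(
      new BFCAllocator(sub, 1LL << 30 /*1GiB, illustrative*/,
                       true /*allow_growth*/, "example_bfc"));
}
}  // namespace tensorflow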
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
index 583bff2c07..6b2f6547b0 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
@@ -31,7 +31,8 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) {
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
           platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie()),
+              .ValueOrDie(),
+          0 /*numa_node*/, {}, {}),
       new NoopRounder, "pool");
   EXPECT_EQ(nullptr, pool.AllocateRaw(4 /*alignment*/, 0 /*num_bytes*/));
@@ -49,7 +50,8 @@ TEST(PoolAllocatorTest, ZeroSizePool) {
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
           platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie()),
+              .ValueOrDie(),
+          0 /*numa_node*/, {}, {}),
       new NoopRounder, "pool");
   EXPECT_EQ(0, pool.get_from_pool_count());
@@ -82,7 +84,8 @@ TEST(PoolAllocatorTest, Alignment) {
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
           platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie()),
+              .ValueOrDie(),
+          0 /*numa_node*/, {}, {}),
       new NoopRounder, "pool");
   for (int i = 0; i < 16; ++i) {
     size_t alignment = 1 << i;
@@ -97,8 +100,8 @@ TEST(PoolAllocatorTest, Alignment) {
 TEST(PoolAllocatorTest, AutoResize) {
   PoolAllocator pool(2 /*pool_size_limit*/, true /*auto_resize*/,
-                     new BasicCPUAllocator(0 /*numa_node*/), new NoopRounder,
-                     "pool");
+                     new BasicCPUAllocator(0 /*numa_node*/, {}, {}),
+                     new NoopRounder, "pool");
   // Alloc/dealloc 10 sizes just a few times, confirming pool size
   // stays at 2.
@@ -123,14 +126,32 @@ TEST(PoolAllocatorTest, AutoResize) {
 }
 TEST(PoolAllocatorTest, CudaHostAllocator) {
+  int alloc_count = 0;
+  int64 alloc_size = 0;
+  SubAllocator::Visitor alloc_visitor =
+      [&alloc_count, &alloc_size](void* ptr, int numa_node, int64 size) {
+        ++alloc_count;
+        alloc_size += size;
+      };
+  int free_count = 0;
+  int64 free_size = 0;
+  SubAllocator::Visitor free_visitor =
+      [&free_count, &free_size](void* ptr, int numa_node, int64 size) {
+        ++free_count;
+        free_size += size;
+      };
   se::Platform* platform =
       se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
-  PoolAllocator pool(
-      2 /*pool_size_limit*/, false /*auto_resize*/,
-      new CUDAHostAllocator(
-          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie()),
-      new NoopRounder, "pool");
+  CUDAHostAllocator* sub_allocator = new CUDAHostAllocator(
+      platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
+          .ValueOrDie(),
+      0 /*numa_node*/, {alloc_visitor}, {free_visitor});
+  PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/,
+                     sub_allocator, new NoopRounder, "pool");
+  EXPECT_EQ(0, alloc_count);
+  EXPECT_EQ(0, alloc_size);
+  EXPECT_EQ(0, free_count);
+  EXPECT_EQ(0, free_size);
   // Repeatedly Get a 16-byte value, confirming that there's only
   // one real allocation.
@@ -138,6 +159,10 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   EXPECT_EQ(0, pool.get_from_pool_count());
   EXPECT_EQ(1, pool.allocated_count());
   EXPECT_NE(nullptr, p1_16);
+  EXPECT_EQ(1, alloc_count);  // Underlying suballoc of 16 bytes
+  // Each suballocation includes a 16B ChunkPrefix.
+  static const int kChunkPrefixSize = 16;
+  EXPECT_EQ(16 + (alloc_count * kChunkPrefixSize), alloc_size);
   pool.DeallocateRaw(p1_16);
   // Pool contents {16}
   EXPECT_EQ(1, pool.put_count());
@@ -148,6 +173,9 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   pool.DeallocateRaw(p2_16);  // Put it back.
   // Pool contents {16}
   EXPECT_EQ(2, pool.put_count());
+  EXPECT_EQ(1, alloc_count);  // Underlying suballoc of 16 bytes
+  EXPECT_EQ(16 + (alloc_count * kChunkPrefixSize), alloc_size);
+  EXPECT_EQ(0, free_count);
   // Get two more values of different sizes.
   void* p3_4 = pool.AllocateRaw(4, 4);
@@ -160,6 +188,9 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   void* p4_2 = pool.AllocateRaw(4, 2);  // Get a third size buffer.
   EXPECT_NE(nullptr, p4_2);
   EXPECT_EQ(0, pool.evicted_count());
+  EXPECT_EQ(3, alloc_count);
+  EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
+  EXPECT_EQ(0, free_count);
   // The pool is full: when we put back p4_2, the 16-byte buffer
   // should be evicted since it was least recently inserted.
@@ -167,6 +198,10 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   // Pool contents {2, 4}
   EXPECT_EQ(4, pool.put_count());
   EXPECT_EQ(1, pool.evicted_count());
+  EXPECT_EQ(3, alloc_count);
+  EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
+  EXPECT_EQ(1, free_count);
+  EXPECT_EQ(16 + (free_count * kChunkPrefixSize), free_size);
   // Re-getting and putting size 2 or 4 should not alter pool size or
   // num-evicted.
@@ -180,12 +215,20 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   EXPECT_EQ(6, pool.put_count());
   EXPECT_EQ(3, pool.allocated_count());
   EXPECT_EQ(1, pool.evicted_count());
+  EXPECT_EQ(3, alloc_count);
+  EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
+  EXPECT_EQ(1, free_count);
+  EXPECT_EQ(16 + (free_count * kChunkPrefixSize), free_size);
   pool.Clear();
   EXPECT_EQ(0, pool.get_from_pool_count());
   EXPECT_EQ(0, pool.put_count());
   EXPECT_EQ(0, pool.allocated_count());
   EXPECT_EQ(0, pool.evicted_count());
+  EXPECT_EQ(3, alloc_count);
+  EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
+  EXPECT_EQ(3, free_count);
+  EXPECT_EQ(16 + 4 + 2 + (free_count * kChunkPrefixSize), free_size);
 }
 TEST(PoolAllocatorTest, Pow2Rounder) {
@@ -206,7 +249,8 @@ TEST(PoolAllocatorTest, Name) {
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie()),
+              .ValueOrDie(),
+          0 /*numa_node*/, {}, {}),
       new NoopRounder, "pool");
   EXPECT_EQ("pool", pool.Name());
 }
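The size expectations in TEST(PoolAllocatorTest, CudaHostAllocator) all follow one rule: the visitor observes the requested bytes plus one 16-byte ChunkPrefix per underlying suballocation. A compile-time restatement of that arithmetic; this snippet is illustrative and not part of the test:

// Each PoolAllocator chunk carries a 16-byte ChunkPrefix, so a visitor sees
// requested_bytes + kChunkPrefixSize per underlying allocation.
#include <cstddef>

constexpr size_t kChunkPrefixSize = 16;
constexpr size_t VisitedSize(size_t requested) {
  return requested + kChunkPrefixSize;
}
// Three suballocations of 16, 4 and 2 bytes match the test's expectation of
// 16 + 4 + 2 + (3 * kChunkPrefixSize) = 70 visited bytes in total.
static_assert(VisitedSize(16) + VisitedSize(4) + VisitedSize(2) == 70,
              "visitor accounting matches the test above");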
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index df9c3a686c..538a70668a 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -23,12 +23,11 @@ limitations under the License.
 #include <cstdlib>
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
-#include "tensorflow/core/framework/allocator_registry.h"
+#include "tensorflow/core/common_runtime/pool_allocator.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mem.h"
-#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 #ifndef INTEL_MKL_DNN_ONLY
 #include "i_malloc.h"
@@ -40,20 +39,16 @@ typedef unsigned int uint;
 namespace tensorflow {
-class MklSubAllocator : public SubAllocator {
+class MklSubAllocator : public BasicCPUAllocator {
  public:
+  MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {}
   ~MklSubAllocator() override {}
-
-  void* Alloc(size_t alignment, size_t num_bytes) override {
-    return port::AlignedMalloc(num_bytes, alignment);
-  }
-  void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
 };
 // CPU allocator that handles small-size allocations by calling
 // suballocator directly. Mostly, it is just a wrapper around a suballocator
 // (that calls malloc and free directly) with support for bookkeeping.
-class MklSmallSizeAllocator : public VisitableAllocator {
+class MklSmallSizeAllocator : public Allocator {
  public:
   MklSmallSizeAllocator(SubAllocator* sub_allocator, size_t total_memory,
                         const string& name)
@@ -75,10 +70,6 @@ class MklSmallSizeAllocator : public VisitableAllocator {
       CHECK(map_.insert(map_val).second);
       // Increment statistics for small-size allocations.
       IncrementStats(num_bytes);
-      // Call alloc visitors.
-      for (const auto& visitor : alloc_visitors_) {
-        visitor(ptr, num_bytes);
-      }
     }
     return ptr;
   }
@@ -94,9 +85,6 @@ class MklSmallSizeAllocator : public VisitableAllocator {
     if (map_iter != map_.end()) {
       // Call free visitors.
       size_t dealloc_bytes = map_iter->second;
-      for (const auto& visitor : free_visitors_) {
-        visitor(ptr, dealloc_bytes);
-      }
       sub_allocator_->Free(ptr, dealloc_bytes);
       DecrementStats(dealloc_bytes);
       map_.erase(map_iter);
@@ -121,16 +109,6 @@ class MklSmallSizeAllocator : public VisitableAllocator {
     stats_.Clear();
   }
-  void AddAllocVisitor(Visitor visitor) override {
-    mutex_lock l(mutex_);
-    alloc_visitors_.push_back(visitor);
-  }
-
-  void AddFreeVisitor(Visitor visitor) override {
-    mutex_lock l(mutex_);
-    free_visitors_.push_back(visitor);
-  }
-
  private:
   // Increment statistics for the allocator handling small allocations.
   inline void IncrementStats(size_t alloc_size)
@@ -163,15 +141,11 @@ class MklSmallSizeAllocator : public VisitableAllocator {
   // Allocator stats for small allocs
   AllocatorStats stats_ GUARDED_BY(mutex_);
-
-  // Visitors
-  std::vector<Visitor> alloc_visitors_ GUARDED_BY(mutex_);
-  std::vector<Visitor> free_visitors_ GUARDED_BY(mutex_);
 };
 /// CPU allocator for MKL that wraps BFC allocator and intercepts
 /// and redirects memory allocation calls from MKL.
-class MklCPUAllocator : public VisitableAllocator {
+class MklCPUAllocator : public Allocator {
  public:
   // Constructor and other standard functions
@@ -284,16 +258,6 @@ class MklCPUAllocator : public VisitableAllocator {
     large_size_allocator_->ClearStats();
   }
-  void AddAllocVisitor(Visitor visitor) override {
-    small_size_allocator_->AddAllocVisitor(visitor);
-    large_size_allocator_->AddAllocVisitor(visitor);
-  }
-
-  void AddFreeVisitor(Visitor visitor) override {
-    small_size_allocator_->AddFreeVisitor(visitor);
-    large_size_allocator_->AddFreeVisitor(visitor);
-  }
-
  private:
   // Hooks provided by this allocator for memory allocation routines from MKL
@@ -330,7 +294,7 @@ class MklCPUAllocator : public VisitableAllocator {
   // The alignment that we need for the allocations
   static constexpr const size_t kAlignment = 64;
-  VisitableAllocator* large_size_allocator_;  // owned by this class
+  Allocator* large_size_allocator_;  // owned by this class
   MklSmallSizeAllocator* small_size_allocator_;  // owned by this class.
   SubAllocator* sub_allocator_;  // not owned by this class
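With visitor support now living in SubAllocator, MklSubAllocator no longer needs hand-rolled Alloc/Free or visitor lists; deriving from BasicCPUAllocator is enough. A hypothetical sketch of how an MKL-style suballocator could still accept visitors if one were ever needed; this class does not appear in the patch:

// Hypothetical sketch: an MKL-style suballocator that forwards visitor lists
// to the BasicCPUAllocator base instead of implementing AddAllocVisitor().
#include <vector>
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/platform/numa.h"

namespace tensorflow {
class VisitedMklSubAllocator : public BasicCPUAllocator {
 public:
  VisitedMklSubAllocator(const std::vector<Visitor>& alloc_visitors,
                         const std::vector<Visitor>& free_visitors)
      : BasicCPUAllocator(port::kNUMANoAffinity, alloc_visitors,
                          free_visitors) {}
  // Alloc()/Free() are inherited and already invoke the visitors.
};
}  // namespace tensorflow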
diff --git a/tensorflow/core/common_runtime/pool_allocator.cc b/tensorflow/core/common_runtime/pool_allocator.cc
index fdad8de8d6..66dc8f3322 100644
--- a/tensorflow/core/common_runtime/pool_allocator.cc
+++ b/tensorflow/core/common_runtime/pool_allocator.cc
@@ -40,8 +40,7 @@ PoolAllocator::PoolAllocator(size_t pool_size_limit, bool auto_resize,
       auto_resize_(auto_resize),
       pool_size_limit_(pool_size_limit),
       allocator_(allocator),
-      size_rounder_(size_rounder),
-      allocation_begun_(false) {
+      size_rounder_(size_rounder) {
   if (auto_resize) {
     CHECK_LT(size_t{0}, pool_size_limit)
         << "size limit must be > 0 if auto_resize is true.";
@@ -93,7 +92,6 @@ ChunkPrefix* FindPrefix(void* user_ptr) {
 }  // namespace
 void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
-  if (!allocation_begun_) allocation_begun_ = true;
   if (num_bytes == 0) return nullptr;
   // If alignment is larger than kPoolAlignment, increase num_bytes so that we
@@ -129,9 +127,6 @@ void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
     return PrepareChunk(r, alignment, num_bytes);
   } else {
     void* ptr = allocator_->Alloc(kPoolAlignment, num_bytes);
-    for (const auto& v : alloc_visitors_) {
-      v(ptr, num_bytes);
-    }
     return PrepareChunk(ptr, alignment, num_bytes);
   }
 }
@@ -141,9 +136,6 @@ void PoolAllocator::DeallocateRaw(void* ptr) {
   ChunkPrefix* cp = FindPrefix(ptr);
   CHECK_LE((void*)cp, (void*)ptr);
   if (!has_size_limit_ && !auto_resize_) {
-    for (const auto& v : free_visitors_) {
-      v(cp, cp->num_bytes);
-    }
     allocator_->Free(cp, cp->num_bytes);
   } else {
     mutex_lock lock(mutex_);
@@ -164,9 +156,6 @@ void PoolAllocator::Clear() {
     mutex_lock lock(mutex_);
     for (auto iter : pool_) {
       PtrRecord* pr = iter.second;
-      for (const auto& v : free_visitors_) {
-        v(pr->ptr, pr->num_bytes);
-      }
       allocator_->Free(pr->ptr, pr->num_bytes);
       delete pr;
     }
@@ -221,9 +210,6 @@ void PoolAllocator::EvictOne() {
       DCHECK(iter != pool_.end());
     }
     pool_.erase(iter);
-    for (const auto& v : free_visitors_) {
-      v(prec->ptr, prec->num_bytes);
-    }
     allocator_->Free(prec->ptr, prec->num_bytes);
     delete prec;
     ++evicted_count_;
@@ -269,28 +255,19 @@ void PoolAllocator::EvictOne() {
   }
 }
-void PoolAllocator::AddAllocVisitor(Visitor visitor) {
-  mutex_lock lock(mutex_);
-  CHECK(!allocation_begun_)
-      << "AddAllocVisitor may not be called after pool allocation "
-      << "has begun.";
-  alloc_visitors_.push_back(visitor);
-}
-
-void PoolAllocator::AddFreeVisitor(Visitor visitor) {
-  mutex_lock lock(mutex_);
-  CHECK(!allocation_begun_)
-      << "AddFreeVisitor may not be called after pool allocation "
-      << "has begun.";
-  free_visitors_.push_back(visitor);
-}
-
 void* BasicCPUAllocator::Alloc(size_t alignment, size_t num_bytes) {
-  return port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+  void* ptr = nullptr;
+  if (num_bytes > 0) {
+    ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+    VisitAlloc(ptr, numa_node_, num_bytes);
+  }
+  return ptr;
 }
 void BasicCPUAllocator::Free(void* ptr, size_t num_bytes) {
-  port::AlignedFree(ptr);
+  if (num_bytes > 0) {
+    VisitFree(ptr, numa_node_, num_bytes);
+    port::AlignedFree(ptr);
+  }
 }
-
 }  // namespace tensorflow
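Since PoolAllocator no longer carries visitors itself, instrumentation now attaches to the SubAllocator at construction time. A minimal sketch of the new wiring; the logging visitor and pool parameters are illustrative:

// Sketch: visitors ride on the SubAllocator, not on the pool.
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
PoolAllocator* MakeInstrumentedCpuPool() {
  SubAllocator::Visitor log_visitor = [](void* ptr, int numa_node,
                                         size_t num_bytes) {
    LOG(INFO) << "suballoc of " << num_bytes << " bytes on node " << numa_node;
  };
  return new PoolAllocator(
      100 /*pool_size_limit*/, true /*auto_resize*/,
      new BasicCPUAllocator(0 /*numa_node*/, {log_visitor}, {} /*free*/),
      new NoopRounder, "instrumented_cpu_pool");
}
}  // namespace tensorflow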
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index 607734445b..5b4623ba10 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -16,14 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_
-// Simple LRU pool allocators for various flavors of CPU RAM that
-// implement the VisitableAllocator interface.
+// Simple LRU pool allocators for various flavors of CPU RAM.
 #include <atomic>
 #include <map>
 #include <memory>
 #include <vector>
-#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -41,7 +40,7 @@ class RoundUpInterface {
 // Size-limited pool of memory buffers obtained from a SubAllocator
 // instance. Pool eviction policy is LRU.
-class PoolAllocator : public VisitableAllocator {
+class PoolAllocator : public Allocator {
  public:
   // "pool_size_limit" is the maximum number of returned, re-usable
   // memory buffers to keep in the pool. If pool_size_limit == 0, the
@@ -64,14 +63,6 @@ class PoolAllocator : public VisitableAllocator {
   void DeallocateRaw(void* ptr) override;
-  // REQUIRES: The following functions may only be called prior
-  // to the first Allocate*() call. Once allocation has begun, it is
-  // illegal to register another visitor.
-
-  void AddAllocVisitor(Visitor visitor) override;
-
-  void AddFreeVisitor(Visitor visitor) override;
-
   // Allocate an unused memory region of size "num_bytes". Fetch from
   // the pool if available, otherwise call allocator_.
   void* Get(size_t num_bytes);
@@ -141,12 +132,6 @@ class PoolAllocator : public VisitableAllocator {
   int64 put_count_ GUARDED_BY(mutex_) = 0;
   int64 allocated_count_ GUARDED_BY(mutex_) = 0;
   int64 evicted_count_ GUARDED_BY(mutex_) = 0;
-  // Write access to these is guarded by mutex_, but not read
-  // access. They may only be modified prior to the first
-  // allocation. Later attempts to modify will fail.
-  std::vector<Visitor> alloc_visitors_;
-  std::vector<Visitor> free_visitors_;
-  std::atomic<bool> allocation_begun_;
 };
 // Do-nothing rounder. Passes through sizes unchanged.
@@ -166,7 +151,9 @@ class Pow2Rounder : public RoundUpInterface {
 class BasicCPUAllocator : public SubAllocator {
  public:
   // Argument numa_node is currently ignored.
-  explicit BasicCPUAllocator(int numa_node) : numa_node_(numa_node) {}
+  BasicCPUAllocator(int numa_node, const std::vector<Visitor>& alloc_visitors,
+                    const std::vector<Visitor>& free_visitors)
+      : SubAllocator(alloc_visitors, free_visitors), numa_node_(numa_node) {}
   ~BasicCPUAllocator() override {}
@@ -176,6 +163,8 @@ class BasicCPUAllocator : public SubAllocator {
  private:
   int numa_node_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(BasicCPUAllocator);
 };
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/process_state.cc b/tensorflow/core/common_runtime/process_state.cc
index 447338e7bd..bcaa37fc8a 100644
--- a/tensorflow/core/common_runtime/process_state.cc
+++ b/tensorflow/core/common_runtime/process_state.cc
@@ -71,20 +71,28 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
   return MemDesc();
 }
-VisitableAllocator* ProcessState::GetCPUAllocator(int numa_node) {
+Allocator* ProcessState::GetCPUAllocator(int numa_node) {
   CHECK_GE(numa_node, 0);
   if (!numa_enabled_) numa_node = 0;
   mutex_lock lock(mu_);
   while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
+    // If visitors have been defined we need an Allocator built from
+    // a SubAllocator. Prefer BFCAllocator, but fall back to PoolAllocator
+    // depending on env var setting.
+    const bool alloc_visitors_defined =
+        (!cpu_alloc_visitors_.empty() || !cpu_free_visitors_.empty());
     bool use_bfc_allocator = false;
-    // TODO(reedwm): Switch default to BGFAllocator if it's at least as fast and
-    // efficient.
-    Status status = ReadBoolFromEnvVar("TF_CPU_ALLOCATOR_USE_BFC", false,
-                                       &use_bfc_allocator);
+    Status status = ReadBoolFromEnvVar(
+        "TF_CPU_ALLOCATOR_USE_BFC", alloc_visitors_defined, &use_bfc_allocator);
     if (!status.ok()) {
       LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
     }
-    VisitableAllocator* allocator;
+    Allocator* allocator = nullptr;
+    SubAllocator* sub_allocator =
+        (alloc_visitors_defined || use_bfc_allocator)
+            ? new BasicCPUAllocator(numa_enabled_ ? numa_node : -1,
+                                    cpu_alloc_visitors_, cpu_free_visitors_)
+            : nullptr;
     if (use_bfc_allocator) {
       // TODO(reedwm): evaluate whether 64GB by default is the best choice.
       int64 cpu_mem_limit_in_mb = -1;
@@ -95,34 +103,63 @@ VisitableAllocator* ProcessState::GetCPUAllocator(int numa_node) {
         LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
       }
       int64 cpu_mem_limit = cpu_mem_limit_in_mb * (1LL << 20);
-      allocator = new BFCAllocator(
-          new BasicCPUAllocator(numa_enabled_ ? numa_node : -1), cpu_mem_limit,
-          true /*allow_growth*/, "bfc_cpu_allocator_for_gpu" /*name*/);
+      DCHECK(sub_allocator);
+      allocator =
+          new BFCAllocator(sub_allocator, cpu_mem_limit, true /*allow_growth*/,
+                           "bfc_cpu_allocator_for_gpu" /*name*/);
       VLOG(2) << "Using BFCAllocator with memory limit of "
               << cpu_mem_limit_in_mb << " MB for ProcessState CPU allocator";
-    } else {
-      allocator = new PoolAllocator(
-          100 /*pool_size_limit*/, true /*auto_resize*/,
-          new BasicCPUAllocator(numa_enabled_ ? numa_node : -1),
-          new NoopRounder, "cpu_pool");
+    } else if (alloc_visitors_defined) {
+      DCHECK(sub_allocator);
+      allocator =
+          new PoolAllocator(100 /*pool_size_limit*/, true /*auto_resize*/,
+                            sub_allocator, new NoopRounder, "cpu_pool");
       VLOG(2) << "Using PoolAllocator for ProcessState CPU allocator "
               << "numa_enabled_=" << numa_enabled_
              << " numa_node=" << numa_node;
+    } else {
+      DCHECK(!sub_allocator);
+      allocator = cpu_allocator();
     }
-    if (LogMemory::IsEnabled()) {
+    if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
       // Wrap the allocator to track allocation ids for better logging
       // at the cost of performance.
-      allocator = new TrackingVisitableAllocator(allocator, true);
+      allocator = new TrackingAllocator(allocator, true);
     }
     cpu_allocators_.push_back(allocator);
+    if (!sub_allocator) {
+      DCHECK(cpu_alloc_visitors_.empty() && cpu_free_visitors_.empty());
+    }
   }
   return cpu_allocators_[numa_node];
 }
+void ProcessState::AddCPUAllocVisitor(SubAllocator::Visitor visitor) {
+  VLOG(1) << "AddCPUAllocVisitor";
+  mutex_lock lock(mu_);
+  CHECK_EQ(0, cpu_allocators_.size())  // Crash OK
+      << "AddCPUAllocVisitor must be called prior to first call to "
+         "ProcessState::GetCPUAllocator";
+  cpu_alloc_visitors_.push_back(std::move(visitor));
+}
+
+void ProcessState::AddCPUFreeVisitor(SubAllocator::Visitor visitor) {
+  mutex_lock lock(mu_);
+  CHECK_EQ(0, cpu_allocators_.size())  // Crash OK
+      << "AddCPUFreeVisitor must be called prior to first call to "
+         "ProcessState::GetCPUAllocator";
+  cpu_free_visitors_.push_back(std::move(visitor));
+}
+
 void ProcessState::TestOnlyReset() {
   mutex_lock lock(mu_);
+  // Don't delete this value because it's static.
+  Allocator* default_cpu_allocator = cpu_allocator();
   mem_desc_map_.clear();
-  gtl::STLDeleteElements(&cpu_allocators_);
+  for (Allocator* a : cpu_allocators_) {
+    if (a != default_cpu_allocator) delete a;
+  }
+  cpu_allocators_.clear();
   gtl::STLDeleteElements(&cpu_al_);
 }
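For the CPU path the ordering requirement reads the same way: all visitors must be in place before the first GetCPUAllocator call materializes an allocator. A caller-side sketch, with illustrative visitor bodies:

// Sketch: registering CPU visitors ahead of the first GetCPUAllocator call.
#include "tensorflow/core/common_runtime/process_state.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
void InstallCpuMemoryVisitors() {
  ProcessState* ps = ProcessState::singleton();
  ps->AddCPUAllocVisitor([](void* ptr, int numa_node, size_t num_bytes) {
    VLOG(1) << "new CPU region: " << num_bytes << " bytes on " << numa_node;
  });
  ps->AddCPUFreeVisitor([](void* ptr, int numa_node, size_t num_bytes) {
    VLOG(1) << "freeing CPU region: " << num_bytes << " bytes";
  });
  // Any GetCPUAllocator() call after this point builds a BFCAllocator over a
  // BasicCPUAllocator that carries these visitors.
}
}  // namespace tensorflow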
diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h
index 2892677333..cac312d849 100644
--- a/tensorflow/core/common_runtime/process_state.h
+++ b/tensorflow/core/common_runtime/process_state.h
@@ -30,7 +30,6 @@ limitations under the License.
 namespace tensorflow {
 class Allocator;
-class VisitableAllocator;
 class PoolAllocator;
 // Singleton that manages per-process state, e.g. allocation of
@@ -65,7 +64,15 @@ class ProcessState {
   // Returns the one CPUAllocator used for the given numa_node.
   // TEMPORARY: ignores numa_node.
-  VisitableAllocator* GetCPUAllocator(int numa_node);
+  Allocator* GetCPUAllocator(int numa_node);
+
+  // Registers alloc visitor for the CPU allocator(s).
+  // REQUIRES: must be called before GetCPUAllocator.
+  void AddCPUAllocVisitor(SubAllocator::Visitor v);
+
+  // Registers free visitor for the CPU allocator(s).
+  // REQUIRES: must be called before GetCPUAllocator.
+  void AddCPUFreeVisitor(SubAllocator::Visitor v);
   typedef std::unordered_map<const void*, MemDesc> MDMap;
@@ -87,7 +94,9 @@ class ProcessState {
   mutex mu_;
-  std::vector<VisitableAllocator*> cpu_allocators_ GUARDED_BY(mu_);
+  std::vector<Allocator*> cpu_allocators_ GUARDED_BY(mu_);
+  std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ GUARDED_BY(mu_);
+  std::vector<SubAllocator::Visitor> cpu_free_visitors_ GUARDED_BY(mu_);
   virtual ~ProcessState();
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
index 103eee03b3..9d59264899 100644
--- a/tensorflow/core/common_runtime/renamed_device.h
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -72,9 +72,10 @@ class RenamedDevice : public Device {
     return underlying_->MakeGpuDevice();
   }
-  void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
-                             DeviceContext* dc, Allocator* allocator) override {
-    underlying_->ReinitializeGpuDevice(context, device, dc, allocator);
+  Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
+                               DeviceContext* dc,
+                               Allocator* allocator) override {
+    return underlying_->ReinitializeGpuDevice(context, device, dc, allocator);
   }
   Status MakeTensorFromProto(const TensorProto& tensor_proto,
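Because ReinitializeGpuDevice now returns Status, every override has to propagate a value the way RenamedDevice does above. A sketch of what a wrapping Device subclass would look like after this change; MyDevice, its wrapped_ member, and the PrepareScratch helper are all hypothetical:

// Hypothetical override, sketched to show the new Status-returning contract.
Status MyDevice::ReinitializeGpuDevice(OpKernelContext* context,
                                       PerOpGpuDevice* device,
                                       DeviceContext* dc,
                                       Allocator* allocator) {
  TF_RETURN_IF_ERROR(PrepareScratch(allocator));  // hypothetical helper
  return wrapped_->ReinitializeGpuDevice(context, device, dc, allocator);
}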
diff --git a/tensorflow/core/common_runtime/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h
deleted file mode 100644
index ae0563a96a..0000000000
--- a/tensorflow/core/common_runtime/visitable_allocator.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
-#define TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
-
-#include <functional>
-#include "tensorflow/core/framework/allocator.h"
-#include "tensorflow/core/framework/tracking_allocator.h"
-
-namespace tensorflow {
-
-// Subclass VisitableAllocator instead of Allocator when a memory
-// allocator needs to enable some kind of registration/deregistration
-// of memory areas.
-class VisitableAllocator : public Allocator {
- public:
-  // Visitor gets called with a pointer to a memory area and its
-  // size in bytes.
-  typedef std::function<void(void*, size_t)> Visitor;
-
-  // Register a visitor guaranteed to be called exactly once on each
-  // chunk of memory newly allocated from the underlying device.
-  // Typically, chunks will be reused and possibly sub-divided by a
-  // pool manager, so the calls will happen only once per process
-  // execution, not once per tensor (re)allocation.
-  virtual void AddAllocVisitor(Visitor visitor) = 0;
-
-  // Register a visitor guaranteed to be called on each chunk of
-  // memory returned to the underlying device.
-  virtual void AddFreeVisitor(Visitor visitor) = 0;
-};
-
-// Needed for cases when a VisitableAllocator gets wrapped for tracking.
-// Multiple-inheritance is considered acceptable in this case because
-// VisitableAllocator is a pure virtual interface and only TrackingAllocator
-// has default implementation.
-class TrackingVisitableAllocator : public TrackingAllocator,
-                                   public VisitableAllocator {
- public:
-  TrackingVisitableAllocator(VisitableAllocator* allocator, bool track_ids)
-      : TrackingAllocator(allocator, track_ids), allocator_(allocator) {}
-  ~TrackingVisitableAllocator() override {}
-
-  string Name() override { return TrackingAllocator::Name(); }
-
-  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    return TrackingAllocator::AllocateRaw(alignment, num_bytes);
-  }
-
-  void DeallocateRaw(void* ptr) override {
-    TrackingAllocator::DeallocateRaw(ptr);
-  }
-
-  void AddAllocVisitor(Visitor visitor) override {
-    allocator_->AddAllocVisitor(visitor);
-  }
-
-  void AddFreeVisitor(Visitor visitor) override {
-    allocator_->AddFreeVisitor(visitor);
-  }
-
- protected:
-  VisitableAllocator* allocator_;
-};
-}  // namespace tensorflow
-#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 2a7ee16a16..84cee5569c 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -196,7 +196,7 @@ class CPUAllocatorFactory : public AllocatorFactory {
 class CPUSubAllocator : public SubAllocator {
  public:
   explicit CPUSubAllocator(CPUAllocator* cpu_allocator)
-      : cpu_allocator_(cpu_allocator) {}
+      : SubAllocator({}, {}), cpu_allocator_(cpu_allocator) {}
   void* Alloc(size_t alignment, size_t num_bytes) override {
     return cpu_allocator_->AllocateRaw(alignment, num_bytes);
@@ -222,4 +222,22 @@ Allocator* cpu_allocator() {
   }
   return cpu_alloc;
 }
+
+SubAllocator::SubAllocator(const std::vector<Visitor>& alloc_visitors,
+                           const std::vector<Visitor>& free_visitors)
+    : alloc_visitors_(alloc_visitors), free_visitors_(free_visitors) {}
+
+void SubAllocator::VisitAlloc(void* ptr, int index, size_t num_bytes) {
+  for (const auto& v : alloc_visitors_) {
+    v(ptr, index, num_bytes);
+  }
+}
+
+void SubAllocator::VisitFree(void* ptr, int index, size_t num_bytes) {
+  // Although we don't guarantee any order of visitor application, strive
+  // to apply free visitors in reverse order of alloc visitors.
+  for (int i = free_visitors_.size() - 1; i >= 0; --i) {
+    free_visitors_[i](ptr, index, num_bytes);
+  }
+}
 }  // namespace tensorflow
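One subtlety in VisitFree above: free visitors run in reverse registration order, giving constructor/destructor-style nesting when one visitor's cleanup depends on another's. The signature change relative to the deleted VisitableAllocator is also worth restating as a migration sketch, not part of the patch:

// Old: VisitableAllocator::Visitor was std::function<void(void*, size_t)>.
// New: SubAllocator::Visitor adds the device index.
#include "tensorflow/core/framework/allocator.h"

namespace tensorflow {
SubAllocator::Visitor example_visitor = [](void* ptr, int index,
                                           size_t num_bytes) {
  // 'index' is the numa_node for CPU suballocators and the GPU id for GPU
  // suballocators; the old two-argument form had no equivalent field.
};
}  // namespace tensorflow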
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index ded120b704..8c23604625 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 namespace tensorflow {
@@ -387,13 +388,36 @@ void EnableCPUAllocatorStats(bool enable);
 // full statistics. By default, it's disabled.
 void EnableCPUAllocatorFullStats(bool enable);
-// Abstract interface of an object that does the underlying suballoc/free of
-// memory for a higher-level allocator.
+// An object that does the underlying suballoc/free of memory for a
+// higher-level allocator. The expectation is that the higher-level allocator
+// is doing some kind of cache or pool management so that it will call
+// SubAllocator::Alloc and Free relatively infrequently, compared to the number
+// of times its own AllocateRaw and Free methods are called.
 class SubAllocator {
  public:
+  // Visitor gets called with a pointer to a memory area and its
+  // size in bytes. The index value will be numa_node for a CPU
+  // allocator and GPU id for a GPU allocator.
+  typedef std::function<void(void*, int index, size_t)> Visitor;
+
+  SubAllocator(const std::vector<Visitor>& alloc_visitors,
+               const std::vector<Visitor>& free_visitors);
+
   virtual ~SubAllocator() {}
   virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
   virtual void Free(void* ptr, size_t num_bytes) = 0;
+
+ protected:
+  // Implementation of Alloc() method must call this on newly allocated
+  // value.
+  void VisitAlloc(void* ptr, int index, size_t num_bytes);
+
+  // Implementation of Free() method must call this on value to be
+  // freed immediately before deallocation.
+  void VisitFree(void* ptr, int index, size_t num_bytes);
+
+  const std::vector<Visitor> alloc_visitors_;
+  const std::vector<Visitor> free_visitors_;
 };
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 794250a2c1..53ac639b4c 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -214,10 +214,12 @@ class DeviceBase {
   // This is overridden by GPU devices to reinitialize the derived
   // type returned by MakeGpuDevice.
-  virtual void ReinitializeGpuDevice(OpKernelContext* /*context*/,
-                                     PerOpGpuDevice* /*device*/,
-                                     DeviceContext* /*dc*/,
-                                     Allocator* /*allocator*/) {}
+  virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
+                                       PerOpGpuDevice* /*device*/,
+                                       DeviceContext* /*dc*/,
+                                       Allocator* /*allocator*/) {
+    return Status::OK();
+  }
   // Unimplemented by default
   virtual const DeviceAttributes& attributes() const;
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 80f2b12987..3e34bf0418 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -265,9 +265,12 @@ OpKernelContext::OpKernelContext(Params* params, int num_outputs)
   params_->ensure_eigen_gpu_device();
   if (params_->eigen_gpu_device != nullptr) {
     Allocator* eigen_gpu_allocator = get_allocator(AllocatorAttributes());
-    params_->device->ReinitializeGpuDevice(this, params_->eigen_gpu_device,
-                                           params_->op_device_context,
-                                           eigen_gpu_allocator);
+    Status s = params_->device->ReinitializeGpuDevice(
+        this, params_->eigen_gpu_device, params_->op_device_context,
+        eigen_gpu_allocator);
+    if (!s.ok()) {
+      SetStatus(s);
+    }
   }
   if (params_->record_tensor_accesses) {
     referenced_tensors_.Init();
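Taken together, the SubAllocator hooks define a contract for implementers: call VisitAlloc after obtaining memory and VisitFree immediately before releasing it, exactly as BasicCPUAllocator does above. A self-contained sketch of a conforming implementation; the class itself is illustrative and not part of the patch:

// Illustrative SubAllocator that satisfies the VisitAlloc/VisitFree contract.
#include <vector>
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/mem.h"

namespace tensorflow {
class ExampleSubAllocator : public SubAllocator {
 public:
  ExampleSubAllocator(int index, const std::vector<Visitor>& alloc_visitors,
                      const std::vector<Visitor>& free_visitors)
      : SubAllocator(alloc_visitors, free_visitors), index_(index) {}

  void* Alloc(size_t alignment, size_t num_bytes) override {
    void* ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
    VisitAlloc(ptr, index_, num_bytes);  // required: after allocation succeeds
    return ptr;
  }

  void Free(void* ptr, size_t num_bytes) override {
    VisitFree(ptr, index_, num_bytes);  // required: just before deallocation
    port::AlignedFree(ptr);
  }

 private:
  const int index_;  // numa_node or GPU id reported to visitors
};
}  // namespace tensorflow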