author | A. Unique TensorFlower <gardener@tensorflow.org> | 2018-09-17 22:09:02 -0700
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-09-17 22:13:46 -0700
commit | 7c826588b058c14fd8c152bedb4e256c57ae1248 (patch)
tree | 7acacce04bca5d86d24969278a3553a96cd1f1c0 /tensorflow/core/common_runtime
parent | b91e27a9c33d038af79a0944eb9046b926d483c8 (diff)
Automated rollback of commit 185aa89912376d4088c22615908696cd30f9951b
PiperOrigin-RevId: 213394522
Diffstat (limited to 'tensorflow/core/common_runtime')
23 files changed, 446 insertions, 502 deletions
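
The central change in this rollback is that the GPU-side allocators again derive from VisitableAllocator rather than plain Allocator: memory-registration callbacks are attached to an allocator after construction (AddAllocVisitor/AddFreeVisitor) instead of being passed into SubAllocator constructors. As a rough sketch of the restored interface, reconstructed from the calls visible in this patch rather than copied verbatim (the actual declaration lives in tensorflow/core/common_runtime/visitable_allocator.h):

    // Sketch only, inferred from the calls in this patch.
    class VisitableAllocator : public Allocator {
     public:
      // Called with a memory region's base pointer and size.
      typedef std::function<void(void*, size_t)> Visitor;
      // Register a visitor to run on each allocated region.
      virtual void AddAllocVisitor(Visitor visitor) = 0;
      // Register a visitor to run on each region being freed.
      virtual void AddFreeVisitor(Visitor visitor) = 0;
    };

BFCAllocator implements AddAllocVisitor by replaying the visitor over regions it has already reserved and by invoking it from Extend() on each new region, so for BFC-backed allocators the registration order relative to allocation does not matter.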
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 3843ea9e60..84c6285bbe 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -31,7 +31,7 @@ namespace tensorflow {
 BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                            bool allow_growth, const string& name)
-    : sub_allocator_(sub_allocator),
+    : suballocator_(sub_allocator),
       name_(name),
       free_chunks_list_(kInvalidChunkHandle),
       next_allocation_id_(1) {
@@ -72,7 +72,7 @@ BFCAllocator::~BFCAllocator() {
   VLOG(2) << "Number of regions allocated: "
           << region_manager_.regions().size();
   for (const auto& region : region_manager_.regions()) {
-    sub_allocator_->Free(region.ptr(), region.memory_size());
+    suballocator_->Free(region.ptr(), region.memory_size());
   }
 
   for (BinNum b = 0; b < kNumBins; b++) {
@@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
   // Try allocating.
   size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
-  void* mem_addr = sub_allocator_->Alloc(alignment, bytes);
+  void* mem_addr = suballocator_->Alloc(alignment, bytes);
   if (mem_addr == nullptr && !started_backpedal_) {
     // Only backpedal once.
     started_backpedal_ = true;
@@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
     while (mem_addr == nullptr) {
       bytes = RoundedBytes(bytes * kBackpedalFactor);
       if (bytes < rounded_bytes) break;
-      mem_addr = sub_allocator_->Alloc(alignment, bytes);
+      mem_addr = suballocator_->Alloc(alignment, bytes);
     }
   }
@@ -158,6 +158,10 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
   // Insert the chunk into the right bin.
   InsertFreeChunkIntoBin(h);
 
+  // Invoke visitors on newly allocated region.
+  for (const auto& visitor : region_visitors_) {
+    visitor(mem_addr, bytes);
+  }
   return true;
 }
@@ -486,6 +490,15 @@ void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) {
   InsertFreeChunkIntoBin(coalesced_chunk);
 }
 
+void BFCAllocator::AddAllocVisitor(Visitor visitor) {
+  VLOG(1) << "AddVisitor";
+  mutex_lock l(lock_);
+  region_visitors_.push_back(visitor);
+  for (const auto& region : region_manager_.regions()) {
+    visitor(region.ptr(), region.memory_size());
+  }
+}
+
 bool BFCAllocator::TracksAllocationSizes() { return true; }
 
 size_t BFCAllocator::RequestedSize(const void* ptr) {
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 364071e066..20e1dab1d5 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
-#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/macros.h"
@@ -42,7 +42,7 @@ namespace tensorflow {
 // coalescing. One assumption we make is that the process using this
 // allocator owns pretty much all of the memory, and that nearly
 // all requests to allocate memory go through this interface.
-class BFCAllocator : public Allocator {
+class BFCAllocator : public VisitableAllocator {
  public:
   // Takes ownership of sub_allocator.
   BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
@@ -55,6 +55,11 @@ class BFCAllocator : public Allocator {
                const AllocationAttributes& allocation_attr) override;
   void DeallocateRaw(void* ptr) override;
 
+  void AddAllocVisitor(Visitor visitor) override;
+
+  // Does nothing, because memory is never freed.
+  void AddFreeVisitor(Visitor visitor) override {}
+
   bool TracksAllocationSizes() override;
 
   size_t RequestedSize(const void* ptr) override;
@@ -418,7 +423,7 @@ class BFCAllocator : public Allocator {
   // of the available memory.
   bool started_backpedal_ = false;
 
-  std::unique_ptr<SubAllocator> sub_allocator_;
+  std::unique_ptr<SubAllocator> suballocator_;
   string name_;
 
   // Structures mutable after construction
@@ -430,6 +435,9 @@ class BFCAllocator : public Allocator {
   // Pointer to head of linked list of free Chunks
   ChunkHandle free_chunks_list_ GUARDED_BY(lock_);
 
+  // Called once on each region, ASAP.
+  std::vector<Visitor> region_visitors_ GUARDED_BY(lock_);
+
   // Counter containing the next unique identifier to assign to a
   // newly-created chunk.
   int64 next_allocation_id_ GUARDED_BY(lock_);
diff --git a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h b/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
index 6bd29ef775..636cd43575 100644
--- a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
@@ -26,12 +26,8 @@ namespace tensorflow {
 class CUDAHostAllocator : public SubAllocator {
  public:
   // Note: stream_exec cannot be null.
-  explicit CUDAHostAllocator(se::StreamExecutor* stream_exec, int numa_node,
-                             const std::vector<Visitor>& alloc_visitors,
-                             const std::vector<Visitor>& free_visitors)
-      : SubAllocator(alloc_visitors, free_visitors),
-        stream_exec_(stream_exec),
-        numa_node_(numa_node) {
+  explicit CUDAHostAllocator(se::StreamExecutor* stream_exec)
+      : stream_exec_(stream_exec) {
     CHECK(stream_exec_ != nullptr);
   }
   ~CUDAHostAllocator() override {}
@@ -43,23 +39,19 @@ class CUDAHostAllocator : public SubAllocator {
       if (ptr == nullptr) {
         LOG(WARNING) << "could not allocate pinned host memory of size: "
                      << num_bytes;
-        return ptr;
       }
-      VisitAlloc(ptr, numa_node_, num_bytes);
     }
     return ptr;
   }
 
   void Free(void* ptr, size_t num_bytes) override {
     if (ptr != nullptr) {
-      VisitFree(ptr, numa_node_, num_bytes);
       stream_exec_->HostMemoryDeallocate(ptr);
     }
   }
 
  private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
-  const int numa_node_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
index 44ffce77a1..2d4c8d0201 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
@@ -22,15 +22,18 @@ limitations under the License.
 
 namespace tensorflow {
 
-GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator,
-                                 size_t total_memory, const string& name)
-    : GPUBFCAllocator(sub_allocator, total_memory, GPUOptions(), name) {}
+GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+                                 const string& name)
+    : GPUBFCAllocator(cuda_gpu_id, total_memory, GPUOptions(), name) {}
 
-GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator,
-                                 size_t total_memory,
+GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
                                  const GPUOptions& gpu_options,
                                  const string& name)
-    : BFCAllocator(sub_allocator, total_memory, gpu_options.allow_growth(),
-                   name) {}
+    : BFCAllocator(
+          new GPUMemAllocator(
+              GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
+              gpu_options.per_process_gpu_memory_fraction() > 1.0 ||
+                  gpu_options.experimental().use_unified_memory()),
+          total_memory, gpu_options.allow_growth(), name) {}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index 6b6de80734..f1cc2eace1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -31,20 +31,28 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Suballocator for GPU memory.
-class GPUMemAllocator : public SubAllocator {
+// A GPU memory allocator that implements a 'best-fit with coalescing'
+// algorithm.
+class GPUBFCAllocator : public BFCAllocator {
  public:
   // 'cuda_gpu_id' refers to the ID of the GPU device within
   // the process and must reference a valid ID in the process.
+  GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+                  const string& name);
+  GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+                  const GPUOptions& gpu_options, const string& name);
+  virtual ~GPUBFCAllocator() {}
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
+};
+
+// Suballocator for GPU memory.
+class GPUMemAllocator : public SubAllocator {
+ public:
   // Note: stream_exec cannot be null.
-  explicit GPUMemAllocator(se::StreamExecutor* stream_exec, CudaGpuId gpu_id,
-                           bool use_unified_memory,
-                           const std::vector<Visitor>& alloc_visitors,
-                           const std::vector<Visitor>& free_visitors)
-      : SubAllocator(alloc_visitors, free_visitors),
-        stream_exec_(stream_exec),
-        gpu_id_(gpu_id),
-        use_unified_memory_(use_unified_memory) {
+  explicit GPUMemAllocator(se::StreamExecutor* stream_exec,
+                           bool use_unified_memory)
+      : stream_exec_(stream_exec), use_unified_memory_(use_unified_memory) {
     CHECK(stream_exec_ != nullptr);
   }
   ~GPUMemAllocator() override {}
@@ -57,14 +65,12 @@ class GPUMemAllocator : public SubAllocator {
     } else {
       ptr = stream_exec_->AllocateArray<char>(num_bytes).opaque();
     }
-      VisitAlloc(ptr, gpu_id_.value(), num_bytes);
   }
   return ptr;
 }
 
   void Free(void* ptr, size_t num_bytes) override {
     if (ptr != nullptr) {
-      VisitFree(ptr, gpu_id_.value(), num_bytes);
       if (use_unified_memory_) {
         stream_exec_->UnifiedMemoryDeallocate(ptr);
       } else {
@@ -76,25 +82,11 @@ class GPUMemAllocator : public SubAllocator {
 
  private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
-  const CudaGpuId gpu_id_;
   const bool use_unified_memory_ = false;
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator);
 };
 
-// A GPU memory allocator that implements a 'best-fit with coalescing'
-// algorithm.
-class GPUBFCAllocator : public BFCAllocator {
- public:
-  GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory,
-                  const string& name);
-  GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory,
-                  const GPUOptions& gpu_options, const string& name);
-  ~GPUBFCAllocator() override {}
-
-  TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
-};
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 7112c3afd4..67caeb3495 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -47,11 +46,7 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
 }
 
 TEST(GPUBFCAllocatorTest, NoDups) {
-  CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   // Allocate a lot of raw pointers
@@ -80,11 +75,7 @@ TEST(GPUBFCAllocatorTest, NoDups) {
 }
 
 TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
-  CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   // Allocate 256 raw pointers of sizes between 100 bytes and about
   // a meg
   random::PhiloxRandom philox(123, 17);
@@ -142,11 +133,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
 }
 
 TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
-  CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   float* first_ptr = a.Allocate<float>(1024);
@@ -181,30 +168,18 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
 }
 
 TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
-  CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   float* ptr = a.Allocate<float>(0);
   EXPECT_EQ(nullptr, ptr);
 }
 
 TEST(GPUBFCAllocatorTest, TracksSizes) {
-  CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
 TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
-  CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   float* t1 = a.Allocate<float>(1);
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
@@ -212,12 +187,8 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
 }
 
 TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
-  CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
   // Configure a 1MiB byte limit
-  GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc");
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc");
 
   float* first_ptr = a.Allocate<float>(1 << 6);
   float* second_ptr = a.Allocate<float>(1 << 20);
@@ -232,11 +203,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
   options.set_allow_growth(true);
 
   // Max of 2GiB, but starts out small.
-  CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1LL << 31, "GPU_0_bfc");
+  GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc");
 
   // Allocate 10 raw pointers of sizes between 100 bytes and about
   // 64 megs.
@@ -297,15 +264,8 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
 }
 
 TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
-  CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1UL << 60, "GPU_0_bfc");
-  sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator b(sub_allocator, 1UL << 60, "GPU_0_bfc");
+  GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
+  GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
   void* amem = a.AllocateRaw(1, 1);
   void* bmem = b.AllocateRaw(1, 1 << 30);
   a.DeallocateRaw(amem);
@@ -313,11 +273,7 @@ TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
 }
 
 static void BM_Allocation(int iters) {
-  CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
+  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<size_t> sizes = {256,     4096,      16384,    524288,
                                512,     1048576,   10485760, 104857600,
@@ -333,11 +289,7 @@ static void BM_Allocation(int iters) {
 BENCHMARK(BM_Allocation);
 
 static void BM_AllocationThreaded(int iters, int num_threads) {
-  CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
+  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
   thread::ThreadPool pool(Env::Default(), "test", num_threads);
   std::atomic_int_fast32_t count(iters);
   mutex done_lock;
@@ -373,11 +325,7 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);
 // A more complex benchmark that defers deallocation of an object for
 // "delay" allocations.
 static void BM_AllocationDelayed(int iters, int delay) {
-  CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
   int size_index = 0;
@@ -415,11 +363,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   // only methods inside this class can access private members of BFCAllocator.
 
   void TestBinDebugInfo() {
-    CudaGpuId cuda_gpu_id(0);
-    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+    GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
 
     std::vector<void*> initial_ptrs;
     std::vector<size_t> initial_ptrs_allocated_sizes;
@@ -497,11 +441,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   }
 
   void TestLog2FloorNonZeroSlow() {
-    CudaGpuId cuda_gpu_id(0);
-    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator a(sub_allocator, 1 /* total_memory */, "GPU_0_bfc");
+    GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc");
     EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
     EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
     EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 8e14f1ea75..934a57a5fb 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -27,7 +27,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-GPUcudaMallocAllocator::GPUcudaMallocAllocator(Allocator* allocator,
+GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator,
                                                CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
   stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -60,6 +60,14 @@ void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
 #endif  // GOOGLE_CUDA
 }
 
+void GPUcudaMallocAllocator::AddAllocVisitor(Visitor visitor) {
+  return base_allocator_->AddAllocVisitor(visitor);
+}
+
+void GPUcudaMallocAllocator::AddFreeVisitor(Visitor visitor) {
+  return base_allocator_->AddFreeVisitor(visitor);
+}
+
 bool GPUcudaMallocAllocator::TracksAllocationSizes() { return false; }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 3d1d0ef481..856fdc34b4 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
@@ -29,17 +29,20 @@ namespace tensorflow {
 // An allocator that wraps a GPU allocator and adds debugging
 // functionality that verifies that users do not write outside their
 // allocated memory.
-class GPUcudaMallocAllocator : public Allocator {
+class GPUcudaMallocAllocator : public VisitableAllocator {
  public:
-  explicit GPUcudaMallocAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
+  explicit GPUcudaMallocAllocator(VisitableAllocator* allocator,
+                                  CudaGpuId cuda_gpu_id);
   ~GPUcudaMallocAllocator() override;
   string Name() override { return "gpu_debug"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
+  void AddAllocVisitor(Visitor visitor) override;
+  void AddFreeVisitor(Visitor visitor) override;
   bool TracksAllocationSizes() override;
 
  private:
-  Allocator* base_allocator_ = nullptr;  // owned
+  VisitableAllocator* base_allocator_ = nullptr;  // owned
 
   se::StreamExecutor* stream_exec_;  // Not owned.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 6bad66dcec..e4c834b30d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -73,7 +73,7 @@ void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
 // -----------------------------------------------------------------------------
 // GPUDebugAllocator
 // -----------------------------------------------------------------------------
-GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
+GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator,
                                      CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
   stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -111,6 +111,14 @@ void GPUDebugAllocator::DeallocateRaw(void* ptr) {
   base_allocator_->DeallocateRaw(ptr);
 }
 
+void GPUDebugAllocator::AddAllocVisitor(Visitor visitor) {
+  return base_allocator_->AddAllocVisitor(visitor);
+}
+
+void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) {
+  return base_allocator_->AddFreeVisitor(visitor);
+}
+
 bool GPUDebugAllocator::TracksAllocationSizes() { return true; }
 
 size_t GPUDebugAllocator::RequestedSize(const void* ptr) {
@@ -150,7 +158,7 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) {
 // -----------------------------------------------------------------------------
 // GPUNanResetAllocator
 // -----------------------------------------------------------------------------
-GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
+GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator,
                                            CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
   stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -192,6 +200,14 @@ void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
   base_allocator_->DeallocateRaw(ptr);
 }
 
+void GPUNanResetAllocator::AddAllocVisitor(Visitor visitor) {
+  return base_allocator_->AddAllocVisitor(visitor);
+}
+
+void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) {
+  return base_allocator_->AddFreeVisitor(visitor);
+}
+
 size_t GPUNanResetAllocator::RequestedSize(const void* ptr) {
   return base_allocator_->RequestedSize(ptr);
 }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 0f27ff4384..0f9b72040c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
@@ -31,13 +31,16 @@ namespace tensorflow {
 // An allocator that wraps a GPU allocator and adds debugging
 // functionality that verifies that users do not write outside their
 // allocated memory.
-class GPUDebugAllocator : public Allocator {
+class GPUDebugAllocator : public VisitableAllocator {
  public:
-  explicit GPUDebugAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
+  explicit GPUDebugAllocator(VisitableAllocator* allocator,
+                             CudaGpuId cuda_gpu_id);
   ~GPUDebugAllocator() override;
   string Name() override { return "gpu_debug"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
+  void AddAllocVisitor(Visitor visitor) override;
+  void AddFreeVisitor(Visitor visitor) override;
   bool TracksAllocationSizes() override;
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
@@ -50,7 +53,7 @@ class GPUDebugAllocator : public Allocator {
   bool CheckFooter(void* ptr);
 
  private:
-  Allocator* base_allocator_ = nullptr;  // owned
+  VisitableAllocator* base_allocator_ = nullptr;  // owned
 
   se::StreamExecutor* stream_exec_;  // Not owned.
 
@@ -60,20 +63,23 @@ class GPUDebugAllocator : public Allocator {
 // An allocator that wraps a GPU allocator and resets the memory on
 // allocation and free to 'NaN', helping to identify cases where the
 // user forgets to initialize the memory.
-class GPUNanResetAllocator : public Allocator {
+class GPUNanResetAllocator : public VisitableAllocator {
  public:
-  explicit GPUNanResetAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
+  explicit GPUNanResetAllocator(VisitableAllocator* allocator,
+                                CudaGpuId cuda_gpu_id);
   ~GPUNanResetAllocator() override;
   string Name() override { return "gpu_nan_reset"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
+  void AddAllocVisitor(Visitor visitor) override;
+  void AddFreeVisitor(Visitor visitor) override;
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
   void GetStats(AllocatorStats* stats) override;
   void ClearStats() override;
 
  private:
-  Allocator* base_allocator_ = nullptr;  // owned
+  VisitableAllocator* base_allocator_ = nullptr;  // owned
 
   se::StreamExecutor* stream_exec_;  // Not owned.
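
Note the pattern in the two wrapper classes above: GPUDebugAllocator and GPUNanResetAllocator implement AddAllocVisitor/AddFreeVisitor purely by forwarding to the wrapped VisitableAllocator, so a visitor registered on the outermost wrapper reaches the BFC allocator that actually owns the memory regions. A hedged sketch of how the composition used in the tests below combines with visitor registration (the lambda and its body are illustrative, not part of this patch):

    // Wrap a BFC allocator in debug and NaN-reset layers, as the tests below do.
    CudaGpuId cuda_gpu_id(0);
    GPUNanResetAllocator a(
        new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, "bfc"),
                              cuda_gpu_id),
        cuda_gpu_id);
    // Forwards through both wrappers; BFCAllocator::AddAllocVisitor then
    // invokes the visitor immediately on any regions already allocated.
    a.AddAllocVisitor([](void* ptr, size_t num_bytes) {
      LOG(INFO) << "GPU region at " << ptr << ", " << num_bytes << " bytes";
    });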
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
index 98283cd846..236a0afa0b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -35,10 +35,7 @@ namespace {
 
 TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
   const CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+  GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
                       cuda_gpu_id);
   auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
@@ -62,10 +59,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
   EXPECT_DEATH(
       {
         const CudaGpuId cuda_gpu_id(0);
-        GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-            GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
-            cuda_gpu_id, false /*use_unified_memory*/, {}, {});
-        GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+        GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
                             cuda_gpu_id);
         auto stream_exec =
             GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -98,10 +92,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
   EXPECT_DEATH(
       {
         const CudaGpuId cuda_gpu_id(0);
-        GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-            GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
-            cuda_gpu_id, false /*use_unified_memory*/, {}, {});
-        GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+        GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
                             cuda_gpu_id);
         auto stream_exec =
             GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -131,10 +122,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
 
 TEST(GPUDebugAllocatorTest, ResetToNan) {
   const CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUNanResetAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+  GPUNanResetAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
                          cuda_gpu_id);
   auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
@@ -175,11 +163,8 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
 
 TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
   const CudaGpuId cuda_gpu_id(0);
   // NaN reset must be the outer-most allocator.
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
   GPUNanResetAllocator a(
-      new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+      new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
                             cuda_gpu_id),
       cuda_gpu_id);
   auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -220,21 +205,15 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
 
 TEST(GPUDebugAllocatorTest, TracksSizes) {
   const CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+  GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
                       cuda_gpu_id);
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
 TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
   const CudaGpuId cuda_gpu_id(0);
-  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-      false /*use_unified_memory*/, {}, {});
   GPUNanResetAllocator a(
-      new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+      new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
                             cuda_gpu_id),
       cuda_gpu_id);
   float* t1 = a.Allocate<float>(1);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 50e61b7e00..2763ac0d4a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/common_runtime/local_device.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -284,38 +285,6 @@ BaseGPUDevice::~BaseGPUDevice() {
   for (auto ctx : device_contexts_) ctx->Unref();
 }
 
-// This should be idempotent if already initialized.
-Status BaseGPUDevice::InitScratchBuffers() {
-  mutex_lock l(scratch_init_mutex_);
-  if (scratch_.size() < max_streams_) {
-    for (int i = 0; i < max_streams_; i++) {
-      DCHECK(streams_[i]);
-      if (scratch_.size() > i && scratch_[i]) continue;
-      size_t scratch_buffer_size =
-          Eigen::kCudaScratchSize + sizeof(unsigned int);
-      void* scratch_buffer = gpu_allocator_->AllocateRaw(
-          Allocator::kAllocatorAlignment, scratch_buffer_size);
-      if (scratch_buffer == nullptr) {
-        return errors::FailedPrecondition(
-            "Failed to allocate scratch buffer for device ",
-            tf_gpu_id_.value());
-      }
-      se::DeviceMemory<char> mem(
-          se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
-
-      bool ok = executor_->SynchronousMemZero(
-          &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
-      if (!ok) {
-        return errors::FailedPrecondition(
-            "Failed to memcopy into scratch buffer for device ",
-            tf_gpu_id_.value());
-      }
-      scratch_.push_back(static_cast<char*>(scratch_buffer));
-    }
-  }
-  return Status::OK();
-}
-
 Status BaseGPUDevice::Init(const SessionOptions& options) {
   auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
   if (!executor_status.status().ok()) {
@@ -334,6 +303,27 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
   for (int i = 0; i < max_streams_; i++) {
     streams_.push_back(StreamGroupFactory::Global().GetOrCreate(
         tf_gpu_id_, i, executor_, options.config.gpu_options()));
+
+    size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
+    void* scratch_buffer = gpu_allocator_->AllocateRaw(
+        Allocator::kAllocatorAlignment, scratch_buffer_size);
+    if (scratch_buffer == nullptr) {
+      return errors::FailedPrecondition(
+          "Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
+    }
+    scratch_.push_back(static_cast<char*>(scratch_buffer));
+
+    se::DeviceMemory<char> mem(
+        se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
+
+    bool ok = executor_->SynchronousMemZero(
+        &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
+    if (!ok) {
+      return errors::FailedPrecondition(
+          "Failed to memcopy into scratch buffer for device ",
+          tf_gpu_id_.value());
+    }
+
     device_contexts_.push_back(new GPUDeviceContext(
         i, streams_.back()->compute, streams_.back()->host_to_device,
         streams_.back()->device_to_host, streams_.back()->device_to_device));
@@ -877,11 +867,10 @@ PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
   return new ConcretePerOpGpuDevice();
 }
 
-Status BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
-                                            PerOpGpuDevice* device,
-                                            DeviceContext* dc,
-                                            Allocator* allocator) {
-  TF_RETURN_IF_ERROR(InitScratchBuffers());
+void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
+                                          PerOpGpuDevice* device,
+                                          DeviceContext* dc,
+                                          Allocator* allocator) {
   if (dc) {
     const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
     const int stream_id = gpu_dc->stream_id();
@@ -892,7 +881,6 @@ Status BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
   } else {
     ReinitializeDevice(context, device, 0, allocator);
   }
-  return Status::OK();
 }
 
 Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index b3eea55758..56d03d7a8c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -86,9 +86,8 @@ class BaseGPUDevice : public LocalDevice {
   // The caller owns the returned device.
   PerOpGpuDevice* MakeGpuDevice() override;
 
-  Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
-                               DeviceContext* dc,
-                               Allocator* allocator) override;
+  void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
+                             DeviceContext* dc, Allocator* allocator) override;
 
   // Returns the CUDA GPU id of this device within the native driver system;
   // e.g., for CUDA this is the ordinal of the GPU within the system.
@@ -126,7 +125,6 @@ class BaseGPUDevice : public LocalDevice {
   class StreamGroupFactory;
 
   gtl::InlinedVector<StreamGroup*, 4> streams_;
-  mutex scratch_init_mutex_;
   gtl::InlinedVector<char*, 4> scratch_;
   std::vector<GPUDeviceContext*> device_contexts_;
   GpuDeviceInfo* gpu_device_info_ = nullptr;
@@ -137,9 +135,6 @@ class BaseGPUDevice : public LocalDevice {
   std::unique_ptr<EventMgr> em_;
   std::unique_ptr<thread::ThreadPool> thread_pool_;
 
-  // Initialize scractch buffers used by Eigen.
-  Status InitScratchBuffers();
-
   void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
                           int stream_id, Allocator* allocator);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index 9ec740fabe..b18688174d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -76,16 +76,12 @@ GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
 // This function is defined for debugging problems with the allocators.
 GPUProcessState::~GPUProcessState() {
   CHECK_EQ(this, instance_);
+  for (auto p : gpu_allocators_) {
+    delete p;
+  }
   instance_ = nullptr;
 }
 
-int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
-  // Return the NUMA node associated with the GPU's StreamExecutor.
-  se::StreamExecutor* se =
-      GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
-  return se->GetDeviceDescription().numa_node();
-}
-
 Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
                                             TfGpuId tf_gpu_id,
                                             size_t total_bytes) {
@@ -97,10 +93,13 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
 
   if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
     gpu_allocators_.resize(tf_gpu_id.value() + 1);
+    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
+      gpu_al_.resize(tf_gpu_id.value() + 1);
   }
 
-  AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
-  if (allocator_parts.allocator.get() == nullptr) {
+  if (gpu_allocators_[tf_gpu_id.value()] == nullptr) {
+    VisitableAllocator* gpu_allocator;
+
     // Validate allocator types.
     if (!allocator_type.empty() && allocator_type != "BFC") {
       LOG(ERROR) << "Invalid allocator type: " << allocator_type;
@@ -109,17 +108,8 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
 
     CudaGpuId cuda_gpu_id;
     TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
-    int bus_id = BusIdForGPU(tf_gpu_id);
-    while (bus_id >= gpu_visitors_.size()) {
-      gpu_visitors_.push_back({});
-    }
-    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
-        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
-        (options.per_process_gpu_memory_fraction() > 1.0 ||
-         options.experimental().use_unified_memory()),
-        gpu_visitors_[bus_id], {});
-    Allocator* gpu_allocator =
-        new GPUBFCAllocator(sub_allocator, total_bytes, options,
+    gpu_allocator =
+        new GPUBFCAllocator(cuda_gpu_id, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
 
     // If true, checks for memory overwrites by writing
@@ -133,25 +123,34 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
       // **WARNING** probably will not work in a multi-gpu scenario
       gpu_allocator = new GPUcudaMallocAllocator(gpu_allocator, cuda_gpu_id);
     }
-
-    Allocator* recording_allocator = nullptr;
+    gpu_allocators_[tf_gpu_id.value()] = gpu_allocator;
+
+    // If there are any pending AllocVisitors for this bus, add
+    // them now.
+    se::StreamExecutor* se =
+        GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
+    int bus_id = se->GetDeviceDescription().numa_node();
+    if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
+      for (const auto& v : gpu_visitors_[bus_id]) {
+        gpu_allocator->AddAllocVisitor(v);
+      }
+    }
     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
       ProcessState::MemDesc md;
       md.loc = ProcessState::MemDesc::GPU;
       md.dev_index = cuda_gpu_id.value();
       md.gpu_registered = false;
       md.nic_registered = true;
-      recording_allocator = new internal::RecordingAllocator(
+      if (static_cast<int64>(gpu_al_.size()) <= tf_gpu_id.value()) {
+        gpu_al_.resize(tf_gpu_id.value() + 1);
+      }
+      gpu_al_[tf_gpu_id.value()] = new internal::RecordingAllocator(
           &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
     }
-    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator), sub_allocator,
-                       std::unique_ptr<Allocator>(recording_allocator)};
-  }
-  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
-    return allocator_parts.recording_allocator.get();
-  } else {
-    return allocator_parts.allocator.get();
   }
+  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
+    return gpu_al_[tf_gpu_id.value()];
+  return gpu_allocators_[tf_gpu_id.value()];
 #else
   LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda.";
   return nullptr;
@@ -173,12 +172,11 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
     tf_shared_lock lock(mu_);
 
     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
-        !cuda_host_allocators_.empty() &&
-        cuda_host_allocators_[0].recording_allocator != nullptr) {
-      return cuda_host_allocators_[0].recording_allocator.get();
+        static_cast<int>(cuda_al_.size()) > 0) {
+      return cuda_al_[0];
     }
     if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
-      return cuda_host_allocators_[0].allocator.get();
+      return cuda_host_allocators_[0];
     }
   }
 
@@ -192,7 +190,7 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
   // it knows is valid.
   se::StreamExecutor* se = nullptr;
   for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
-    if (gpu_allocators_[i].allocator != nullptr) {
+    if (gpu_allocators_[i] != nullptr) {
       se = GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
       break;
     }
@@ -201,15 +199,6 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
   CHECK_NE(nullptr, se);
 
   while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) {
-    while (cuda_host_alloc_visitors_.size() <= numa_node) {
-      cuda_host_alloc_visitors_.push_back({});
-    }
-    while (cuda_host_free_visitors_.size() <= numa_node) {
-      cuda_host_free_visitors_.push_back({});
-    }
-    SubAllocator* sub_allocator = new CUDAHostAllocator(
-        se, numa_node, cuda_host_alloc_visitors_[numa_node],
-        cuda_host_free_visitors_[numa_node]);
     // TODO(zheng-xq): evaluate whether 64GB by default is the best choice.
     int64 cuda_host_mem_limit_in_mb = -1;
     Status status = ReadInt64FromEnvVar("TF_CUDA_HOST_MEM_LIMIT_IN_MB",
@@ -219,92 +208,62 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
       LOG(ERROR) << "GetCUDAHostAllocator: " << status.error_message();
     }
     int64 cuda_host_mem_limit = cuda_host_mem_limit_in_mb * (1LL << 20);
-    Allocator* allocator =
-        new BFCAllocator(sub_allocator, cuda_host_mem_limit,
+    VisitableAllocator* allocator =
+        new BFCAllocator(new CUDAHostAllocator(se), cuda_host_mem_limit,
                          true /*allow_growth*/, "cuda_host_bfc" /*name*/);
 
-    if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
+    if (LogMemory::IsEnabled()) {
       // Wrap the allocator to track allocation ids for better logging
       // at the cost of performance.
-      allocator = new TrackingAllocator(allocator, true);
+      allocator = new TrackingVisitableAllocator(allocator, true);
    }
-    cuda_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
-                                     sub_allocator,
-                                     std::unique_ptr<Allocator>(nullptr)});
-    AllocatorParts& allocator_parts = cuda_host_allocators_.back();
+    cuda_host_allocators_.push_back(allocator);
     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
       ProcessState::MemDesc md;
       md.loc = ProcessState::MemDesc::CPU;
       md.dev_index = 0;
       md.gpu_registered = true;
       md.nic_registered = false;
-      allocator_parts.recording_allocator.reset(
-          new internal::RecordingAllocator(&process_state_->mem_desc_map_,
-                                           allocator_parts.allocator.get(), md,
-                                           &mu_));
+      cuda_al_.push_back(new internal::RecordingAllocator(
+          &process_state_->mem_desc_map_, cuda_host_allocators_.back(), md,
+          &mu_));
     }
   }
-  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
-    return cuda_host_allocators_[0].recording_allocator.get();
-  } else {
-    return cuda_host_allocators_[0].allocator.get();
-  }
+  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
+    return cuda_al_[0];
+  return cuda_host_allocators_[0];
 }
 
 void GPUProcessState::AddGPUAllocVisitor(int bus_id,
-                                         const SubAllocator::Visitor& visitor) {
+                                         const AllocVisitor& visitor) {
+  CHECK(process_state_);
 #if GOOGLE_CUDA
   mutex_lock lock(mu_);
-  CHECK(gpu_allocators_.empty())  // Crash OK
-      << "AddGPUAllocVisitor must be called before "
-         "first call to GetGPUAllocator.";
+  for (int i = 0; i < static_cast<int64>(gpu_allocators_.size()); ++i) {
+    se::StreamExecutor* se =
+        GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
+    if (gpu_allocators_[i] &&
+        (se->GetDeviceDescription().numa_node() + 1) == bus_id) {
+      gpu_allocators_[i]->AddAllocVisitor(visitor);
+    }
+  }
   while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
-    gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
+    gpu_visitors_.push_back(std::vector<AllocVisitor>());
   }
   gpu_visitors_[bus_id].push_back(visitor);
 #endif  // GOOGLE_CUDA
 }
 
-void GPUProcessState::AddCUDAHostAllocVisitor(
-    int numa_node, const SubAllocator::Visitor& visitor) {
-#if GOOGLE_CUDA
-  mutex_lock lock(mu_);
-  CHECK(cuda_host_allocators_.empty())  // Crash OK
-      << "AddCUDAHostAllocVisitor must be called before "
-         "first call to GetCUDAHostAllocator.";
-  while (numa_node >= static_cast<int64>(cuda_host_alloc_visitors_.size())) {
-    cuda_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>());
-  }
-  cuda_host_alloc_visitors_[numa_node].push_back(visitor);
-#endif  // GOOGLE_CUDA
-}
-
-void GPUProcessState::AddCUDAHostFreeVisitor(
-    int numa_node, const SubAllocator::Visitor& visitor) {
-#if GOOGLE_CUDA
-  mutex_lock lock(mu_);
-  CHECK(cuda_host_allocators_.empty())  // Crash OK
-      << "AddCUDAHostFreeVisitor must be called before "
-         "first call to GetCUDAHostAllocator.";
-  while (numa_node >= static_cast<int64>(cuda_host_free_visitors_.size())) {
-    cuda_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>());
-  }
-  cuda_host_free_visitors_[numa_node].push_back(visitor);
-#endif  // GOOGLE_CUDA
-}
-
 void GPUProcessState::TestOnlyReset() {
-  if (process_state_) {
-    process_state_->ProcessState::TestOnlyReset();
-  }
+  process_state_->ProcessState::TestOnlyReset();
   {
     mutex_lock lock(mu_);
     gpu_device_enabled_ = false;
-    gpu_allocators_.clear();
     gpu_visitors_.clear();
-    cuda_host_allocators_.clear();
-    cuda_host_alloc_visitors_.clear();
-    cuda_host_free_visitors_.clear();
+    gtl::STLDeleteElements(&gpu_allocators_);
+    gtl::STLDeleteElements(&cuda_host_allocators_);
+    gtl::STLDeleteElements(&gpu_al_);
+    gtl::STLDeleteElements(&cuda_al_);
   }
 }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
index 43e9a31660..cb41c3c6bd 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
@@ -32,6 +32,7 @@ limitations under the License.
 namespace tensorflow {
 
 class Allocator;
+class VisitableAllocator;
 class PoolAllocator;
 
 // Singleton that manages per-process state when GPUs are present.
@@ -71,30 +72,18 @@ class GPUProcessState {
 
   virtual Allocator* GetCUDAHostAllocator(int numa_node);
 
-  // Registers a Visitor to be invoked on new chunks of memory allocated by the
-  // SubAllocator of every GPU proximate to the specified bus. The AllocVisitor
-  // is provided with a memory pointer, a GPU id, and the size of the area it
-  // identifies. The pointer is not guaranteed to be valid after the call
-  // terminates. The intention is for this interface to be used for network
-  // device memory registration. "bus_id" is platform-specific. On many
-  // platforms it should be 0. On machines with multiple PCIe buses, it should
-  // be the index of one of the PCIe buses (maybe the NUMA node at which the
-  // PCIe is rooted). If the bus_id is invalid, results are undefined.
-  virtual void AddGPUAllocVisitor(int bus_id,
-                                  const SubAllocator::Visitor& visitor);
-
-  // Registers a Visitor to be invoked on new chunks of memory allocated by
-  // the SubAllocator of the CUDAHostAllocator for the given numa_node.
-  virtual void AddCUDAHostAllocVisitor(int numa_node,
-                                       const SubAllocator::Visitor& visitor);
-
-  // Registers a Visitor to be invoked on each chunk handed back for freeing to
-  // the SubAllocator of the CUDAHostAllocator for the given numa_node.
-  virtual void AddCUDAHostFreeVisitor(int numa_node,
-                                      const SubAllocator::Visitor& visitor);
-
-  // Returns bus_id for the given GPU id.
-  virtual int BusIdForGPU(TfGpuId tf_gpu_id);
+  // Registers a function to be called once on every new Region
+  // allocated by every GPURegionAllocator proximate to the specified
+  // bus. The AllocVisitor is provided with a memory pointer and the
+  // size of the area it identifies. The pointer is not guaranteed to
+  // be valid after the call terminates. The intention is for this
+  // interface to be used for network device memory registration.
+  // "bus_id" is platform-specific. On many platforms it
+  // should be 0. On machines with multiple PCIe buses, it should be
+  // the index of one of the PCIe buses. If the bus_id is invalid,
+  // results are undefined.
+  typedef std::function<void(void*, size_t)> AllocVisitor;
+  virtual void AddGPUAllocVisitor(int bus_id, const AllocVisitor& visitor);
 
  protected:
   GPUProcessState();
@@ -114,22 +103,17 @@ class GPUProcessState {
 
   mutex mu_;
 
-  struct AllocatorParts {
-    std::unique_ptr<Allocator> allocator;
-    SubAllocator* sub_allocator;  // owned by allocator
-    std::unique_ptr<Allocator> recording_allocator;
-  };
-  std::vector<AllocatorParts> gpu_allocators_ GUARDED_BY(mu_);
-  std::vector<std::vector<SubAllocator::Visitor>> gpu_visitors_ GUARDED_BY(mu_);
-
-  std::vector<AllocatorParts> cuda_host_allocators_ GUARDED_BY(mu_);
-  std::vector<std::vector<SubAllocator::Visitor>> cuda_host_alloc_visitors_
-      GUARDED_BY(mu_);
-  std::vector<std::vector<SubAllocator::Visitor>> cuda_host_free_visitors_
-      GUARDED_BY(mu_);
+  std::vector<VisitableAllocator*> gpu_allocators_ GUARDED_BY(mu_);
+  std::vector<std::vector<AllocVisitor>> gpu_visitors_ GUARDED_BY(mu_);
+  std::vector<Allocator*> cuda_host_allocators_ GUARDED_BY(mu_);
 
   virtual ~GPUProcessState();
 
+  // Optional RecordingAllocators that wrap the corresponding
+  // Allocators for runtime attribute use analysis.
+  std::vector<Allocator*> gpu_al_ GUARDED_BY(mu_);
+  std::vector<Allocator*> cuda_al_ GUARDED_BY(mu_);
+
   friend class GPUDeviceTest;
 };
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
index 6b2f6547b0..583bff2c07 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
@@ -31,8 +31,7 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) {
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
           platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie(),
-          0 /*numa_node*/, {}, {}),
+              .ValueOrDie()),
       new NoopRounder, "pool");
 
   EXPECT_EQ(nullptr, pool.AllocateRaw(4 /*alignment*/, 0 /*num_bytes*/));
@@ -50,8 +49,7 @@ TEST(PoolAllocatorTest, ZeroSizePool) {
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie(),
-          0 /*numa_node*/, {}, {}),
+              .ValueOrDie()),
       new NoopRounder, "pool");
 
   EXPECT_EQ(0, pool.get_from_pool_count());
@@ -84,8 +82,7 @@ TEST(PoolAllocatorTest, Alignment) {
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie(),
-          0 /*numa_node*/, {}, {}),
+              .ValueOrDie()),
       new NoopRounder, "pool");
   for (int i = 0; i < 16; ++i) {
     size_t alignment = 1 << i;
@@ -100,8 +97,8 @@ TEST(PoolAllocatorTest, Alignment) {
 
 TEST(PoolAllocatorTest, AutoResize) {
   PoolAllocator pool(2 /*pool_size_limit*/, true /*auto_resize*/,
-                     new BasicCPUAllocator(0 /*numa_node*/, {}, {}),
-                     new NoopRounder, "pool");
+                     new BasicCPUAllocator(0 /*numa_node*/), new NoopRounder,
+                     "pool");
 
   // Alloc/dealloc 10 sizes just a few times, confirming pool size
   // stays at 2.
@@ -126,32 +123,14 @@ TEST(PoolAllocatorTest, AutoResize) {
 }
 
 TEST(PoolAllocatorTest, CudaHostAllocator) {
-  int alloc_count = 0;
-  int64 alloc_size = 0;
-  SubAllocator::Visitor alloc_visitor =
-      [&alloc_count, &alloc_size](void* ptr, int numa_node, int64 size) {
-        ++alloc_count;
-        alloc_size += size;
-      };
-  int free_count = 0;
-  int64 free_size = 0;
-  SubAllocator::Visitor free_visitor =
-      [&free_count, &free_size](void* ptr, int numa_node, int64 size) {
-        ++free_count;
-        free_size += size;
-      };
   se::Platform* platform =
       se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
-  CUDAHostAllocator* sub_allocator = new CUDAHostAllocator(
-      platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-          .ValueOrDie(),
-      0 /*numa_node*/, {alloc_visitor}, {free_visitor});
-  PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/,
-                     sub_allocator, new NoopRounder, "pool");
-  EXPECT_EQ(0, alloc_count);
-  EXPECT_EQ(0, alloc_size);
-  EXPECT_EQ(0, free_count);
-  EXPECT_EQ(0, free_size);
+  PoolAllocator pool(
+      2 /*pool_size_limit*/, false /*auto_resize*/,
+      new CUDAHostAllocator(
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
+              .ValueOrDie()),
+      new NoopRounder, "pool");
 
   // Repeatedly Get a 16-byte value, confirming that there's only
   // one real allocation.
@@ -159,10 +138,6 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   EXPECT_EQ(0, pool.get_from_pool_count());
   EXPECT_EQ(1, pool.allocated_count());
   EXPECT_NE(nullptr, p1_16);
-  EXPECT_EQ(1, alloc_count);  // Underlying suballoc of 16 bytes
-  // Each suballocation includes a 16B ChunkPrefix.
-  static const int kChunkPrefixSize = 16;
-  EXPECT_EQ(16 + (alloc_count * kChunkPrefixSize), alloc_size);
   pool.DeallocateRaw(p1_16);
   // Pool contents {16}
   EXPECT_EQ(1, pool.put_count());
@@ -173,9 +148,6 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   pool.DeallocateRaw(p2_16);  // Put it back.
   // Pool contents {16}
   EXPECT_EQ(2, pool.put_count());
-  EXPECT_EQ(1, alloc_count);  // Underlying suballoc of 16 bytes
-  EXPECT_EQ(16 + (alloc_count * kChunkPrefixSize), alloc_size);
-  EXPECT_EQ(0, free_count);
 
   // Get two more values of different sizes.
   void* p3_4 = pool.AllocateRaw(4, 4);
@@ -188,9 +160,6 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   void* p4_2 = pool.AllocateRaw(4, 2);  // Get a third size buffer.
   EXPECT_NE(nullptr, p4_2);
   EXPECT_EQ(0, pool.evicted_count());
-  EXPECT_EQ(3, alloc_count);
-  EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
-  EXPECT_EQ(0, free_count);
 
   // The pool is full: when we put back p4_2, the 16-byte buffer
   // should be evicted since it was least recently inserted.
@@ -198,10 +167,6 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   // Pool contents {2, 4}
   EXPECT_EQ(4, pool.put_count());
   EXPECT_EQ(1, pool.evicted_count());
-  EXPECT_EQ(3, alloc_count);
-  EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
-  EXPECT_EQ(1, free_count);
-  EXPECT_EQ(16 + (free_count * kChunkPrefixSize), free_size);
 
   // Re-getting and putting size 2 or 4 should not alter pool size or
   // num-evicted.
@@ -215,20 +180,12 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
   EXPECT_EQ(6, pool.put_count());
   EXPECT_EQ(3, pool.allocated_count());
   EXPECT_EQ(1, pool.evicted_count());
-  EXPECT_EQ(3, alloc_count);
-  EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
-  EXPECT_EQ(1, free_count);
-  EXPECT_EQ(16 + (free_count * kChunkPrefixSize), free_size);
 
   pool.Clear();
   EXPECT_EQ(0, pool.get_from_pool_count());
   EXPECT_EQ(0, pool.put_count());
   EXPECT_EQ(0, pool.allocated_count());
   EXPECT_EQ(0, pool.evicted_count());
-  EXPECT_EQ(3, alloc_count);
-  EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
-  EXPECT_EQ(3, free_count);
-  EXPECT_EQ(16 + 4 + 2 + (free_count * kChunkPrefixSize), free_size);
 }
 
 TEST(PoolAllocatorTest, Pow2Rounder) {
@@ -249,8 +206,7 @@ TEST(PoolAllocatorTest, Name) {
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-              .ValueOrDie(),
-          0 /*numa_node*/, {}, {}),
+              .ValueOrDie()),
       new NoopRounder, "pool");
   EXPECT_EQ("pool", pool.Name());
 }
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 538a70668a..df9c3a686c 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -23,11 +23,12 @@ limitations under the License.
 #include <cstdlib>
 
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
-#include "tensorflow/core/common_runtime/pool_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mem.h"
-#include "tensorflow/core/platform/numa.h"
+#include "tensorflow/core/platform/mutex.h"
 
 #ifndef INTEL_MKL_DNN_ONLY
 #include "i_malloc.h"
@@ -39,16 +40,20 @@ typedef unsigned int uint;
 
 namespace tensorflow {
 
-class MklSubAllocator : public BasicCPUAllocator {
+class MklSubAllocator : public SubAllocator {
  public:
-  MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {}
   ~MklSubAllocator() override {}
+
+  void* Alloc(size_t alignment, size_t num_bytes) override {
+    return port::AlignedMalloc(num_bytes, alignment);
+  }
+  void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
 };
 
 // CPU allocator that handles small-size allocations by calling
 // suballocator directly. Mostly, it is just a wrapper around a suballocator
 // (that calls malloc and free directly) with support for bookkeeping.
-class MklSmallSizeAllocator : public Allocator {
+class MklSmallSizeAllocator : public VisitableAllocator {
  public:
   MklSmallSizeAllocator(SubAllocator* sub_allocator, size_t total_memory,
                         const string& name)
@@ -70,6 +75,10 @@ class MklSmallSizeAllocator : public Allocator {
       CHECK(map_.insert(map_val).second);
       // Increment statistics for small-size allocations.
       IncrementStats(num_bytes);
+      // Call alloc visitors.
+      for (const auto& visitor : alloc_visitors_) {
+        visitor(ptr, num_bytes);
+      }
     }
     return ptr;
   }
@@ -85,6 +94,9 @@ class MklSmallSizeAllocator : public Allocator {
     if (map_iter != map_.end()) {
       // Call free visitors.
       size_t dealloc_bytes = map_iter->second;
+      for (const auto& visitor : free_visitors_) {
+        visitor(ptr, dealloc_bytes);
+      }
       sub_allocator_->Free(ptr, dealloc_bytes);
       DecrementStats(dealloc_bytes);
       map_.erase(map_iter);
@@ -109,6 +121,16 @@ class MklSmallSizeAllocator : public Allocator {
     stats_.Clear();
   }
 
+  void AddAllocVisitor(Visitor visitor) override {
+    mutex_lock l(mutex_);
+    alloc_visitors_.push_back(visitor);
+  }
+
+  void AddFreeVisitor(Visitor visitor) override {
+    mutex_lock l(mutex_);
+    free_visitors_.push_back(visitor);
+  }
+
 private:
   // Increment statistics for the allocator handling small allocations.
   inline void IncrementStats(size_t alloc_size)
@@ -141,11 +163,15 @@ class MklSmallSizeAllocator : public Allocator {
 
   // Allocator stats for small allocs
   AllocatorStats stats_ GUARDED_BY(mutex_);
+
+  // Visitors
+  std::vector<Visitor> alloc_visitors_ GUARDED_BY(mutex_);
+  std::vector<Visitor> free_visitors_ GUARDED_BY(mutex_);
 };
 
 /// CPU allocator for MKL that wraps BFC allocator and intercepts
 /// and redirects memory allocation calls from MKL.
-class MklCPUAllocator : public Allocator {
+class MklCPUAllocator : public VisitableAllocator {
  public:
   // Constructor and other standard functions
@@ -258,6 +284,16 @@ class MklCPUAllocator : public Allocator {
     large_size_allocator_->ClearStats();
   }
 
+  void AddAllocVisitor(Visitor visitor) override {
+    small_size_allocator_->AddAllocVisitor(visitor);
+    large_size_allocator_->AddAllocVisitor(visitor);
+  }
+
+  void AddFreeVisitor(Visitor visitor) override {
+    small_size_allocator_->AddFreeVisitor(visitor);
+    large_size_allocator_->AddFreeVisitor(visitor);
+  }
+
 private:
   // Hooks provided by this allocator for memory allocation routines from MKL
@@ -294,7 +330,7 @@ class MklCPUAllocator : public Allocator {
   // The alignment that we need for the allocations
   static constexpr const size_t kAlignment = 64;
 
-  Allocator* large_size_allocator_;  // owned by this class
+  VisitableAllocator* large_size_allocator_;  // owned by this class
   MklSmallSizeAllocator* small_size_allocator_;  // owned by this class.
 
   SubAllocator* sub_allocator_;  // not owned by this class
diff --git a/tensorflow/core/common_runtime/pool_allocator.cc b/tensorflow/core/common_runtime/pool_allocator.cc
index 66dc8f3322..fdad8de8d6 100644
--- a/tensorflow/core/common_runtime/pool_allocator.cc
+++ b/tensorflow/core/common_runtime/pool_allocator.cc
@@ -40,7 +40,8 @@ PoolAllocator::PoolAllocator(size_t pool_size_limit, bool auto_resize,
       auto_resize_(auto_resize),
       pool_size_limit_(pool_size_limit),
       allocator_(allocator),
-      size_rounder_(size_rounder) {
+      size_rounder_(size_rounder),
+      allocation_begun_(false) {
   if (auto_resize) {
     CHECK_LT(size_t{0}, pool_size_limit)
         << "size limit must be > 0 if auto_resize is true.";
@@ -92,6 +93,7 @@ ChunkPrefix* FindPrefix(void* user_ptr) {
 }  // namespace
 
 void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
+  if (!allocation_begun_) allocation_begun_ = true;
   if (num_bytes == 0) return nullptr;
 
   // If alignment is larger than kPoolAlignment, increase num_bytes so that we
@@ -127,6 +129,9 @@ void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
     return PrepareChunk(r, alignment, num_bytes);
   } else {
     void* ptr = allocator_->Alloc(kPoolAlignment, num_bytes);
+    for (const auto& v : alloc_visitors_) {
+      v(ptr, num_bytes);
+    }
     return PrepareChunk(ptr, alignment, num_bytes);
   }
 }
@@ -136,6 +141,9 @@ void PoolAllocator::DeallocateRaw(void* ptr) {
   ChunkPrefix* cp = FindPrefix(ptr);
   CHECK_LE((void*)cp, (void*)ptr);
   if (!has_size_limit_ && !auto_resize_) {
+    for (const auto& v : free_visitors_) {
+      v(cp, cp->num_bytes);
+    }
     allocator_->Free(cp, cp->num_bytes);
   } else {
     mutex_lock lock(mutex_);
@@ -156,6 +164,9 @@ void PoolAllocator::Clear() {
     mutex_lock lock(mutex_);
     for (auto iter : pool_) {
       PtrRecord* pr = iter.second;
+      for (const auto& v : free_visitors_) {
+        v(pr->ptr, pr->num_bytes);
+      }
       allocator_->Free(pr->ptr, pr->num_bytes);
       delete pr;
     }
@@ -210,6 +221,9 @@ void PoolAllocator::EvictOne() {
     DCHECK(iter != pool_.end());
   }
   pool_.erase(iter);
+  for (const auto& v : free_visitors_) {
+    v(prec->ptr, prec->num_bytes);
+  }
   allocator_->Free(prec->ptr, prec->num_bytes);
   delete prec;
   ++evicted_count_;
@@ -255,19 +269,28 @@ void PoolAllocator::EvictOne() {
   }
 }
 
+void PoolAllocator::AddAllocVisitor(Visitor visitor) {
+  mutex_lock lock(mutex_);
+  CHECK(!allocation_begun_)
+      << "AddAllocVisitor may not be called after pool allocation "
+      << "has begun.";
+  alloc_visitors_.push_back(visitor);
+}
+
+void PoolAllocator::AddFreeVisitor(Visitor visitor) {
+  mutex_lock lock(mutex_);
CHECK(!allocation_begun_) + << "AddFreeVisitor may not be called after pool allocation " + << "has begun."; + free_visitors_.push_back(visitor); +} + void* BasicCPUAllocator::Alloc(size_t alignment, size_t num_bytes) { - void* ptr = nullptr; - if (num_bytes > 0) { - ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment)); - VisitAlloc(ptr, numa_node_, num_bytes); - } - return ptr; + return port::AlignedMalloc(num_bytes, static_cast<int>(alignment)); } void BasicCPUAllocator::Free(void* ptr, size_t num_bytes) { - if (num_bytes > 0) { - VisitFree(ptr, numa_node_, num_bytes); - port::AlignedFree(ptr); - } + port::AlignedFree(ptr); } + } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h index 5b4623ba10..607734445b 100644 --- a/tensorflow/core/common_runtime/pool_allocator.h +++ b/tensorflow/core/common_runtime/pool_allocator.h @@ -16,13 +16,14 @@ limitations under the License. #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_ #define TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_ -// Simple LRU pool allocators for various flavors of CPU RAM. +// Simple LRU pool allocators for various flavors of CPU RAM that +// implement the VisitableAllocator interface. #include <atomic> #include <map> #include <memory> #include <vector> -#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/common_runtime/visitable_allocator.h" #include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" @@ -40,7 +41,7 @@ class RoundUpInterface { // Size-limited pool of memory buffers obtained from a SubAllocator // instance. Pool eviction policy is LRU. -class PoolAllocator : public Allocator { +class PoolAllocator : public VisitableAllocator { public: // "pool_size_limit" is the maximum number of returned, re-usable // memory buffers to keep in the pool. If pool_size_limit == 0, the @@ -63,6 +64,14 @@ class PoolAllocator : public Allocator { void DeallocateRaw(void* ptr) override; + // REQUIRES: The following functions may only be called prior + // to the first Allocate*() call. Once allocation has begun, it is + // illegal to register another visitor. + + void AddAllocVisitor(Visitor visitor) override; + + void AddFreeVisitor(Visitor visitor) override; + // Allocate an unused memory region of size "num_bytes". Fetch from // the pool if available, otherwise call allocator_. void* Get(size_t num_bytes); @@ -132,6 +141,12 @@ class PoolAllocator : public Allocator { int64 put_count_ GUARDED_BY(mutex_) = 0; int64 allocated_count_ GUARDED_BY(mutex_) = 0; int64 evicted_count_ GUARDED_BY(mutex_) = 0; + // Write access to these is guarded by mutex_, but not read + // access. They may only be modified prior to the first + // allocation. Later attempts to modify will fail. + std::vector<Visitor> alloc_visitors_; + std::vector<Visitor> free_visitors_; + std::atomic<bool> allocation_begun_; }; // Do-nothing rounder. Passes through sizes unchanged. @@ -151,9 +166,7 @@ class Pow2Rounder : public RoundUpInterface { class BasicCPUAllocator : public SubAllocator { public: // Argument numa_node is currently ignored. 
- BasicCPUAllocator(int numa_node, const std::vector<Visitor>& alloc_visitors, - const std::vector<Visitor>& free_visitors) - : SubAllocator(alloc_visitors, free_visitors), numa_node_(numa_node) {} + explicit BasicCPUAllocator(int numa_node) : numa_node_(numa_node) {} ~BasicCPUAllocator() override {} @@ -163,8 +176,6 @@ class BasicCPUAllocator : public SubAllocator { private: int numa_node_; - - TF_DISALLOW_COPY_AND_ASSIGN(BasicCPUAllocator); }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/process_state.cc b/tensorflow/core/common_runtime/process_state.cc index bcaa37fc8a..447338e7bd 100644 --- a/tensorflow/core/common_runtime/process_state.cc +++ b/tensorflow/core/common_runtime/process_state.cc @@ -71,28 +71,20 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) { return MemDesc(); } -Allocator* ProcessState::GetCPUAllocator(int numa_node) { +VisitableAllocator* ProcessState::GetCPUAllocator(int numa_node) { CHECK_GE(numa_node, 0); if (!numa_enabled_) numa_node = 0; mutex_lock lock(mu_); while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) { - // If visitors have been defined we need an Allocator built from - // a SubAllocator. Prefer BFCAllocator, but fall back to PoolAllocator - // depending on env var setting. - const bool alloc_visitors_defined = - (!cpu_alloc_visitors_.empty() || !cpu_free_visitors_.empty()); bool use_bfc_allocator = false; - Status status = ReadBoolFromEnvVar( - "TF_CPU_ALLOCATOR_USE_BFC", alloc_visitors_defined, &use_bfc_allocator); + // TODO(reedwm): Switch default to BFCAllocator if it's at least as fast and + // efficient. + Status status = ReadBoolFromEnvVar("TF_CPU_ALLOCATOR_USE_BFC", false, + &use_bfc_allocator); if (!status.ok()) { LOG(ERROR) << "GetCPUAllocator: " << status.error_message(); } - Allocator* allocator = nullptr; - SubAllocator* sub_allocator = - (alloc_visitors_defined || use_bfc_allocator) - ? new BasicCPUAllocator(numa_enabled_ ? numa_node : -1, - cpu_alloc_visitors_, cpu_free_visitors_) - : nullptr; + VisitableAllocator* allocator; if (use_bfc_allocator) { // TODO(reedwm): evaluate whether 64GB by default is the best choice. int64 cpu_mem_limit_in_mb = -1; @@ -103,63 +95,34 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) { LOG(ERROR) << "GetCPUAllocator: " << status.error_message(); } int64 cpu_mem_limit = cpu_mem_limit_in_mb * (1LL << 20); - DCHECK(sub_allocator); - allocator = - new BFCAllocator(sub_allocator, cpu_mem_limit, true /*allow_growth*/, - "bfc_cpu_allocator_for_gpu" /*name*/); + allocator = new BFCAllocator( + new BasicCPUAllocator(numa_enabled_ ? numa_node : -1), cpu_mem_limit, + true /*allow_growth*/, "bfc_cpu_allocator_for_gpu" /*name*/); VLOG(2) << "Using BFCAllocator with memory limit of " << cpu_mem_limit_in_mb << " MB for ProcessState CPU allocator"; - } else if (alloc_visitors_defined) { - DCHECK(sub_allocator); - allocator = - new PoolAllocator(100 /*pool_size_limit*/, true /*auto_resize*/, - sub_allocator, new NoopRounder, "cpu_pool"); + } else { + allocator = new PoolAllocator( + 100 /*pool_size_limit*/, true /*auto_resize*/, + new BasicCPUAllocator(numa_enabled_ ?
numa_node : -1), + new NoopRounder, "cpu_pool"); VLOG(2) << "Using PoolAllocator for ProcessState CPU allocator " << "numa_enabled_=" << numa_enabled_ << " numa_node=" << numa_node; - } else { - DCHECK(!sub_allocator); - allocator = cpu_allocator(); } - if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) { + if (LogMemory::IsEnabled()) { // Wrap the allocator to track allocation ids for better logging // at the cost of performance. - allocator = new TrackingAllocator(allocator, true); + allocator = new TrackingVisitableAllocator(allocator, true); } cpu_allocators_.push_back(allocator); - if (!sub_allocator) { - DCHECK(cpu_alloc_visitors_.empty() && cpu_free_visitors_.empty()); - } } return cpu_allocators_[numa_node]; } -void ProcessState::AddCPUAllocVisitor(SubAllocator::Visitor visitor) { - VLOG(1) << "AddCPUAllocVisitor"; - mutex_lock lock(mu_); - CHECK_EQ(0, cpu_allocators_.size()) // Crash OK - << "AddCPUAllocVisitor must be called prior to first call to " - "ProcessState::GetCPUAllocator"; - cpu_alloc_visitors_.push_back(std::move(visitor)); -} - -void ProcessState::AddCPUFreeVisitor(SubAllocator::Visitor visitor) { - mutex_lock lock(mu_); - CHECK_EQ(0, cpu_allocators_.size()) // Crash OK - << "AddCPUFreeVisitor must be called prior to first call to " - "ProcessState::GetCPUAllocator"; - cpu_free_visitors_.push_back(std::move(visitor)); -} - void ProcessState::TestOnlyReset() { mutex_lock lock(mu_); - // Don't delete this value because it's static. - Allocator* default_cpu_allocator = cpu_allocator(); mem_desc_map_.clear(); - for (Allocator* a : cpu_allocators_) { - if (a != default_cpu_allocator) delete a; - } - cpu_allocators_.clear(); + gtl::STLDeleteElements(&cpu_allocators_); gtl::STLDeleteElements(&cpu_al_); } diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h index cac312d849..2892677333 100644 --- a/tensorflow/core/common_runtime/process_state.h +++ b/tensorflow/core/common_runtime/process_state.h @@ -30,6 +30,7 @@ limitations under the License. namespace tensorflow { class Allocator; +class VisitableAllocator; class PoolAllocator; // Singleton that manages per-process state, e.g. allocation of @@ -64,15 +65,7 @@ class ProcessState { // Returns the one CPUAllocator used for the given numa_node. // TEMPORARY: ignores numa_node. - Allocator* GetCPUAllocator(int numa_node); - - // Registers alloc visitor for the CPU allocator(s). - // REQUIRES: must be called before GetCPUAllocator. - void AddCPUAllocVisitor(SubAllocator::Visitor v); - - // Registers free visitor for the CPU allocator(s). - // REQUIRES: must be called before GetCPUAllocator. 
- void AddCPUFreeVisitor(SubAllocator::Visitor v); + VisitableAllocator* GetCPUAllocator(int numa_node); typedef std::unordered_map<const void*, MemDesc> MDMap; @@ -94,9 +87,7 @@ class ProcessState { mutex mu_; - std::vector<Allocator*> cpu_allocators_ GUARDED_BY(mu_); - std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ GUARDED_BY(mu_); - std::vector<SubAllocator::Visitor> cpu_free_visitors_ GUARDED_BY(mu_); + std::vector<VisitableAllocator*> cpu_allocators_ GUARDED_BY(mu_); virtual ~ProcessState(); diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h index 9d59264899..103eee03b3 100644 --- a/tensorflow/core/common_runtime/renamed_device.h +++ b/tensorflow/core/common_runtime/renamed_device.h @@ -72,10 +72,9 @@ class RenamedDevice : public Device { return underlying_->MakeGpuDevice(); } - Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device, - DeviceContext* dc, - Allocator* allocator) override { - return underlying_->ReinitializeGpuDevice(context, device, dc, allocator); + void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device, + DeviceContext* dc, Allocator* allocator) override { + underlying_->ReinitializeGpuDevice(context, device, dc, allocator); } Status MakeTensorFromProto(const TensorProto& tensor_proto, diff --git a/tensorflow/core/common_runtime/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h new file mode 100644 index 0000000000..ae0563a96a --- /dev/null +++ b/tensorflow/core/common_runtime/visitable_allocator.h @@ -0,0 +1,79 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_ + +#include <functional> +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/tracking_allocator.h" + +namespace tensorflow { + +// Subclass VisitableAllocator instead of Allocator when a memory +// allocator needs to enable some kind of registration/deregistration +// of memory areas. +class VisitableAllocator : public Allocator { + public: + // Visitor gets called with a pointer to a memory area and its + // size in bytes. + typedef std::function<void(void*, size_t)> Visitor; + + // Register a visitor guaranteed to be called exactly once on each + // chunk of memory newly allocated from the underlying device. + // Typically, chunks will be reused and possibly sub-divided by a + // pool manager, so the calls will happen only once per process + // execution, not once per tensor (re)allocation. + virtual void AddAllocVisitor(Visitor visitor) = 0; + + // Register a visitor guaranteed to be called on each chunk of + // memory returned to the underlying device. + virtual void AddFreeVisitor(Visitor visitor) = 0; +}; + +// Needed for cases when a VisitableAllocator gets wrapped for tracking. 
+// Multiple-inheritance is considered acceptable in this case because +// VisitableAllocator is a pure virtual interface and only TrackingAllocator +// has a default implementation. +class TrackingVisitableAllocator : public TrackingAllocator, + public VisitableAllocator { + public: + TrackingVisitableAllocator(VisitableAllocator* allocator, bool track_ids) + : TrackingAllocator(allocator, track_ids), allocator_(allocator) {} + ~TrackingVisitableAllocator() override {} + + string Name() override { return TrackingAllocator::Name(); } + + void* AllocateRaw(size_t alignment, size_t num_bytes) override { + return TrackingAllocator::AllocateRaw(alignment, num_bytes); + } + + void DeallocateRaw(void* ptr) override { + TrackingAllocator::DeallocateRaw(ptr); + } + + void AddAllocVisitor(Visitor visitor) override { + allocator_->AddAllocVisitor(visitor); + } + + void AddFreeVisitor(Visitor visitor) override { + allocator_->AddFreeVisitor(visitor); + } + + protected: + VisitableAllocator* allocator_; +}; +} // namespace tensorflow +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
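The visitor mechanism this rollback restores can be summarized with a minimal, self-contained C++ sketch. Everything below is hypothetical illustration code, not part of this commit: DemoVisitableAllocator, its malloc-backed AllocateRaw, and main() are invented for the example; only the Visitor signature void(void*, size_t) and the register-before-first-allocation rule mirror the VisitableAllocator and PoolAllocator code in the diff above.

#include <atomic>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <utility>
#include <vector>

// Hypothetical stand-in for a VisitableAllocator implementation; invented
// for illustration and not part of the TensorFlow sources in this commit.
class DemoVisitableAllocator {
 public:
  // Same shape as VisitableAllocator::Visitor: the callback receives a
  // pointer to a memory area and its size in bytes.
  using Visitor = std::function<void(void*, size_t)>;

  // Mirrors the PoolAllocator precondition in the diff: visitors may only
  // be registered before the first allocation (the real code enforces this
  // with CHECK; this demo throws instead).
  void AddAllocVisitor(Visitor v) {
    std::lock_guard<std::mutex> l(mu_);
    if (allocation_begun_) throw std::logic_error("too late to add visitor");
    alloc_visitors_.push_back(std::move(v));
  }

  void AddFreeVisitor(Visitor v) {
    std::lock_guard<std::mutex> l(mu_);
    if (allocation_begun_) throw std::logic_error("too late to add visitor");
    free_visitors_.push_back(std::move(v));
  }

  void* AllocateRaw(size_t num_bytes) {
    allocation_begun_ = true;  // freezes the visitor lists from here on
    void* ptr = std::malloc(num_bytes);
    // The lists are immutable once allocation has begun, so they can be
    // read here without taking the mutex.
    for (const auto& v : alloc_visitors_) v(ptr, num_bytes);
    return ptr;
  }

  void DeallocateRaw(void* ptr, size_t num_bytes) {
    // Notify free visitors before the region is returned to the system.
    for (const auto& v : free_visitors_) v(ptr, num_bytes);
    std::free(ptr);
  }

 private:
  std::mutex mu_;                              // guards writes to the lists
  std::atomic<bool> allocation_begun_{false};  // set on first AllocateRaw
  std::vector<Visitor> alloc_visitors_;
  std::vector<Visitor> free_visitors_;
};

int main() {
  DemoVisitableAllocator allocator;
  size_t live_bytes = 0;

  // Typical use: bookkeeping such as registering regions with a device
  // driver, or (as here) tracking how many bytes are currently live.
  allocator.AddAllocVisitor([&live_bytes](void*, size_t n) { live_bytes += n; });
  allocator.AddFreeVisitor([&live_bytes](void*, size_t n) { live_bytes -= n; });

  void* p = allocator.AllocateRaw(64);
  std::cout << "live bytes after alloc: " << live_bytes << "\n";  // 64
  allocator.DeallocateRaw(p, 64);
  std::cout << "live bytes after free: " << live_bytes << "\n";   // 0
  return 0;
}

The freeze-after-first-allocation rule in the sketch corresponds to the pool_allocator.h comment in the diff: the visitor vectors are written under mutex_ but read without it on the allocation fast path, so they must become immutable once allocation begins for unsynchronized reads to stay safe.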