path: root/tensorflow/core/common_runtime
author    A. Unique TensorFlower <gardener@tensorflow.org>  2018-09-17 22:09:02 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>   2018-09-17 22:13:46 -0700
commit 7c826588b058c14fd8c152bedb4e256c57ae1248 (patch)
tree   7acacce04bca5d86d24969278a3553a96cd1f1c0 /tensorflow/core/common_runtime
parent b91e27a9c33d038af79a0944eb9046b926d483c8 (diff)
Automated rollback of commit 185aa89912376d4088c22615908696cd30f9951b
PiperOrigin-RevId: 213394522
Diffstat (limited to 'tensorflow/core/common_runtime')
-rw-r--r--  tensorflow/core/common_runtime/bfc_allocator.cc                 |  21
-rw-r--r--  tensorflow/core/common_runtime/bfc_allocator.h                  |  14
-rw-r--r--  tensorflow/core/common_runtime/gpu/cuda_host_allocator.h        |  12
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc         |  17
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h          |  44
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc    |  90
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc  |  10
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h   |  11
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc       |  20
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h        |  20
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc  |  35
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.cc                |  64
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.h                 |   9
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_process_state.cc         | 161
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_process_state.h          |  58
-rw-r--r--  tensorflow/core/common_runtime/gpu/pool_allocator_test.cc       |  68
-rw-r--r--  tensorflow/core/common_runtime/mkl_cpu_allocator.h              |  50
-rw-r--r--  tensorflow/core/common_runtime/pool_allocator.cc                |  45
-rw-r--r--  tensorflow/core/common_runtime/pool_allocator.h                 |  27
-rw-r--r--  tensorflow/core/common_runtime/process_state.cc                 |  71
-rw-r--r--  tensorflow/core/common_runtime/process_state.h                  |  15
-rw-r--r--  tensorflow/core/common_runtime/renamed_device.h                 |   7
-rw-r--r--  tensorflow/core/common_runtime/visitable_allocator.h            |  79
23 files changed, 446 insertions(+), 502 deletions(-)
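
For orientation, the central interface this rollback reinstates is VisitableAllocator. Its header is listed in the diffstat above but its body is not reproduced in this section, so the following is a minimal sketch inferred from the call sites in the hunks below, assuming Visitor is std::function<void(void*, size_t)>:

// Sketch only: the actual visitable_allocator.h restored by this commit may
// differ in comments and details.
#include <functional>
#include "tensorflow/core/framework/allocator.h"

namespace tensorflow {

// An Allocator that lets clients register callbacks ("visitors") which are
// invoked on regions of memory the allocator acquires from, or returns to,
// the underlying device.
class VisitableAllocator : public Allocator {
 public:
  // A Visitor is called with a pointer to a memory area and its size in bytes.
  typedef std::function<void(void*, size_t)> Visitor;

  // Register a visitor invoked on each chunk of memory newly allocated from
  // the underlying device.
  virtual void AddAllocVisitor(Visitor visitor) = 0;

  // Register a visitor invoked on each chunk of memory returned to the
  // underlying device.
  virtual void AddFreeVisitor(Visitor visitor) = 0;
};

}  // namespace tensorflow
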
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 3843ea9e60..84c6285bbe 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -31,7 +31,7 @@ namespace tensorflow {
BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
bool allow_growth, const string& name)
- : sub_allocator_(sub_allocator),
+ : suballocator_(sub_allocator),
name_(name),
free_chunks_list_(kInvalidChunkHandle),
next_allocation_id_(1) {
@@ -72,7 +72,7 @@ BFCAllocator::~BFCAllocator() {
VLOG(2) << "Number of regions allocated: "
<< region_manager_.regions().size();
for (const auto& region : region_manager_.regions()) {
- sub_allocator_->Free(region.ptr(), region.memory_size());
+ suballocator_->Free(region.ptr(), region.memory_size());
}
for (BinNum b = 0; b < kNumBins; b++) {
@@ -108,7 +108,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
// Try allocating.
size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
- void* mem_addr = sub_allocator_->Alloc(alignment, bytes);
+ void* mem_addr = suballocator_->Alloc(alignment, bytes);
if (mem_addr == nullptr && !started_backpedal_) {
// Only backpedal once.
started_backpedal_ = true;
@@ -119,7 +119,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
while (mem_addr == nullptr) {
bytes = RoundedBytes(bytes * kBackpedalFactor);
if (bytes < rounded_bytes) break;
- mem_addr = sub_allocator_->Alloc(alignment, bytes);
+ mem_addr = suballocator_->Alloc(alignment, bytes);
}
}
@@ -158,6 +158,10 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
// Insert the chunk into the right bin.
InsertFreeChunkIntoBin(h);
+ // Invoke visitors on newly allocated region.
+ for (const auto& visitor : region_visitors_) {
+ visitor(mem_addr, bytes);
+ }
return true;
}
@@ -486,6 +490,15 @@ void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) {
InsertFreeChunkIntoBin(coalesced_chunk);
}
+void BFCAllocator::AddAllocVisitor(Visitor visitor) {
+ VLOG(1) << "AddVisitor";
+ mutex_lock l(lock_);
+ region_visitors_.push_back(visitor);
+ for (const auto& region : region_manager_.regions()) {
+ visitor(region.ptr(), region.memory_size());
+ }
+}
+
bool BFCAllocator::TracksAllocationSizes() { return true; }
size_t BFCAllocator::RequestedSize(const void* ptr) {
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 364071e066..20e1dab1d5 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -23,7 +23,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/common_runtime/allocator_retry.h"
-#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
@@ -42,7 +42,7 @@ namespace tensorflow {
// coalescing. One assumption we make is that the process using this
// allocator owns pretty much all of the memory, and that nearly
// all requests to allocate memory go through this interface.
-class BFCAllocator : public Allocator {
+class BFCAllocator : public VisitableAllocator {
public:
// Takes ownership of sub_allocator.
BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
@@ -55,6 +55,11 @@ class BFCAllocator : public Allocator {
const AllocationAttributes& allocation_attr) override;
void DeallocateRaw(void* ptr) override;
+ void AddAllocVisitor(Visitor visitor) override;
+
+ // Does nothing, because memory is never freed.
+ void AddFreeVisitor(Visitor visitor) override {}
+
bool TracksAllocationSizes() override;
size_t RequestedSize(const void* ptr) override;
@@ -418,7 +423,7 @@ class BFCAllocator : public Allocator {
// of the available memory.
bool started_backpedal_ = false;
- std::unique_ptr<SubAllocator> sub_allocator_;
+ std::unique_ptr<SubAllocator> suballocator_;
string name_;
// Structures mutable after construction
@@ -430,6 +435,9 @@ class BFCAllocator : public Allocator {
// Pointer to head of linked list of free Chunks
ChunkHandle free_chunks_list_ GUARDED_BY(lock_);
+ // Called once on each region, ASAP.
+ std::vector<Visitor> region_visitors_ GUARDED_BY(lock_);
+
// Counter containing the next unique identifier to assign to a
// newly-created chunk.
int64 next_allocation_id_ GUARDED_BY(lock_);
diff --git a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h b/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
index 6bd29ef775..636cd43575 100644
--- a/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/cuda_host_allocator.h
@@ -26,12 +26,8 @@ namespace tensorflow {
class CUDAHostAllocator : public SubAllocator {
public:
// Note: stream_exec cannot be null.
- explicit CUDAHostAllocator(se::StreamExecutor* stream_exec, int numa_node,
- const std::vector<Visitor>& alloc_visitors,
- const std::vector<Visitor>& free_visitors)
- : SubAllocator(alloc_visitors, free_visitors),
- stream_exec_(stream_exec),
- numa_node_(numa_node) {
+ explicit CUDAHostAllocator(se::StreamExecutor* stream_exec)
+ : stream_exec_(stream_exec) {
CHECK(stream_exec_ != nullptr);
}
~CUDAHostAllocator() override {}
@@ -43,23 +39,19 @@ class CUDAHostAllocator : public SubAllocator {
if (ptr == nullptr) {
LOG(WARNING) << "could not allocate pinned host memory of size: "
<< num_bytes;
- return ptr;
}
- VisitAlloc(ptr, numa_node_, num_bytes);
}
return ptr;
}
void Free(void* ptr, size_t num_bytes) override {
if (ptr != nullptr) {
- VisitFree(ptr, numa_node_, num_bytes);
stream_exec_->HostMemoryDeallocate(ptr);
}
}
private:
se::StreamExecutor* stream_exec_; // not owned, non-null
- const int numa_node_;
TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
};
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
index 44ffce77a1..2d4c8d0201 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
@@ -22,15 +22,18 @@ limitations under the License.
namespace tensorflow {
-GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator,
- size_t total_memory, const string& name)
- : GPUBFCAllocator(sub_allocator, total_memory, GPUOptions(), name) {}
+GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+ const string& name)
+ : GPUBFCAllocator(cuda_gpu_id, total_memory, GPUOptions(), name) {}
-GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator,
- size_t total_memory,
+GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
const GPUOptions& gpu_options,
const string& name)
- : BFCAllocator(sub_allocator, total_memory, gpu_options.allow_growth(),
- name) {}
+ : BFCAllocator(
+ new GPUMemAllocator(
+ GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
+ gpu_options.per_process_gpu_memory_fraction() > 1.0 ||
+ gpu_options.experimental().use_unified_memory()),
+ total_memory, gpu_options.allow_growth(), name) {}
} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index 6b6de80734..f1cc2eace1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -31,20 +31,28 @@ limitations under the License.
namespace tensorflow {
-// Suballocator for GPU memory.
-class GPUMemAllocator : public SubAllocator {
+// A GPU memory allocator that implements a 'best-fit with coalescing'
+// algorithm.
+class GPUBFCAllocator : public BFCAllocator {
public:
// 'cuda_gpu_id' refers to the ID of the GPU device within
// the process and must reference a valid ID in the process.
+ GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+ const string& name);
+ GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+ const GPUOptions& gpu_options, const string& name);
+ virtual ~GPUBFCAllocator() {}
+
+ TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
+};
+
+// Suballocator for GPU memory.
+class GPUMemAllocator : public SubAllocator {
+ public:
// Note: stream_exec cannot be null.
- explicit GPUMemAllocator(se::StreamExecutor* stream_exec, CudaGpuId gpu_id,
- bool use_unified_memory,
- const std::vector<Visitor>& alloc_visitors,
- const std::vector<Visitor>& free_visitors)
- : SubAllocator(alloc_visitors, free_visitors),
- stream_exec_(stream_exec),
- gpu_id_(gpu_id),
- use_unified_memory_(use_unified_memory) {
+ explicit GPUMemAllocator(se::StreamExecutor* stream_exec,
+ bool use_unified_memory)
+ : stream_exec_(stream_exec), use_unified_memory_(use_unified_memory) {
CHECK(stream_exec_ != nullptr);
}
~GPUMemAllocator() override {}
@@ -57,14 +65,12 @@ class GPUMemAllocator : public SubAllocator {
} else {
ptr = stream_exec_->AllocateArray<char>(num_bytes).opaque();
}
- VisitAlloc(ptr, gpu_id_.value(), num_bytes);
}
return ptr;
}
void Free(void* ptr, size_t num_bytes) override {
if (ptr != nullptr) {
- VisitFree(ptr, gpu_id_.value(), num_bytes);
if (use_unified_memory_) {
stream_exec_->UnifiedMemoryDeallocate(ptr);
} else {
@@ -76,25 +82,11 @@ class GPUMemAllocator : public SubAllocator {
private:
se::StreamExecutor* stream_exec_; // not owned, non-null
- const CudaGpuId gpu_id_;
const bool use_unified_memory_ = false;
TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator);
};
-// A GPU memory allocator that implements a 'best-fit with coalescing'
-// algorithm.
-class GPUBFCAllocator : public BFCAllocator {
- public:
- GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory,
- const string& name);
- GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory,
- const GPUOptions& gpu_options, const string& name);
- ~GPUBFCAllocator() override {}
-
- TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
-};
-
} // namespace tensorflow
#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 7112c3afd4..67caeb3495 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -47,11 +46,7 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
}
TEST(GPUBFCAllocatorTest, NoDups) {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
CheckStats(&a, 0, 0, 0, 0);
// Allocate a lot of raw pointers
@@ -80,11 +75,7 @@ TEST(GPUBFCAllocatorTest, NoDups) {
}
TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
// Allocate 256 raw pointers of sizes between 100 bytes and about
// a meg
random::PhiloxRandom philox(123, 17);
@@ -142,11 +133,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
}
TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
CheckStats(&a, 0, 0, 0, 0);
float* first_ptr = a.Allocate<float>(1024);
@@ -181,30 +168,18 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
}
TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
float* ptr = a.Allocate<float>(0);
EXPECT_EQ(nullptr, ptr);
}
TEST(GPUBFCAllocatorTest, TracksSizes) {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
EXPECT_EQ(true, a.TracksAllocationSizes());
}
TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
float* t1 = a.Allocate<float>(1);
EXPECT_EQ(4, a.RequestedSize(t1));
EXPECT_EQ(256, a.AllocatedSize(t1));
@@ -212,12 +187,8 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
}
TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
// Configure a 1MiB byte limit
- GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc");
float* first_ptr = a.Allocate<float>(1 << 6);
float* second_ptr = a.Allocate<float>(1 << 20);
@@ -232,11 +203,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
options.set_allow_growth(true);
// Max of 2GiB, but starts out small.
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1LL << 31, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc");
// Allocate 10 raw pointers of sizes between 100 bytes and about
// 64 megs.
@@ -297,15 +264,8 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
}
TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1UL << 60, "GPU_0_bfc");
- sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator b(sub_allocator, 1UL << 60, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
+ GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
void* amem = a.AllocateRaw(1, 1);
void* bmem = b.AllocateRaw(1, 1 << 30);
a.DeallocateRaw(amem);
@@ -313,11 +273,7 @@ TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
}
static void BM_Allocation(int iters) {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
// Exercise a few different allocation sizes
std::vector<size_t> sizes = {256, 4096, 16384, 524288,
512, 1048576, 10485760, 104857600,
@@ -333,11 +289,7 @@ static void BM_Allocation(int iters) {
BENCHMARK(BM_Allocation);
static void BM_AllocationThreaded(int iters, int num_threads) {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
thread::ThreadPool pool(Env::Default(), "test", num_threads);
std::atomic_int_fast32_t count(iters);
mutex done_lock;
@@ -373,11 +325,7 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);
// A more complex benchmark that defers deallocation of an object for
// "delay" allocations.
static void BM_AllocationDelayed(int iters, int delay) {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
// Exercise a few different allocation sizes
std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
int size_index = 0;
@@ -415,11 +363,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
// only methods inside this class can access private members of BFCAllocator.
void TestBinDebugInfo() {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
std::vector<void*> initial_ptrs;
std::vector<size_t> initial_ptrs_allocated_sizes;
@@ -497,11 +441,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
}
void TestLog2FloorNonZeroSlow() {
- CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUBFCAllocator a(sub_allocator, 1 /* total_memory */, "GPU_0_bfc");
+ GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc");
EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 8e14f1ea75..934a57a5fb 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -27,7 +27,7 @@ limitations under the License.
namespace tensorflow {
-GPUcudaMallocAllocator::GPUcudaMallocAllocator(Allocator* allocator,
+GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator,
CudaGpuId cuda_gpu_id)
: base_allocator_(allocator) {
stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -60,6 +60,14 @@ void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
#endif // GOOGLE_CUDA
}
+void GPUcudaMallocAllocator::AddAllocVisitor(Visitor visitor) {
+ return base_allocator_->AddAllocVisitor(visitor);
+}
+
+void GPUcudaMallocAllocator::AddFreeVisitor(Visitor visitor) {
+ return base_allocator_->AddFreeVisitor(visitor);
+}
+
bool GPUcudaMallocAllocator::TracksAllocationSizes() { return false; }
} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 3d1d0ef481..856fdc34b4 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -19,7 +19,7 @@ limitations under the License.
#include <memory>
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
@@ -29,17 +29,20 @@ namespace tensorflow {
// An allocator that wraps a GPU allocator and adds debugging
// functionality that verifies that users do not write outside their
// allocated memory.
-class GPUcudaMallocAllocator : public Allocator {
+class GPUcudaMallocAllocator : public VisitableAllocator {
public:
- explicit GPUcudaMallocAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
+ explicit GPUcudaMallocAllocator(VisitableAllocator* allocator,
+ CudaGpuId cuda_gpu_id);
~GPUcudaMallocAllocator() override;
string Name() override { return "gpu_debug"; }
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
void DeallocateRaw(void* ptr) override;
+ void AddAllocVisitor(Visitor visitor) override;
+ void AddFreeVisitor(Visitor visitor) override;
bool TracksAllocationSizes() override;
private:
- Allocator* base_allocator_ = nullptr; // owned
+ VisitableAllocator* base_allocator_ = nullptr; // owned
se::StreamExecutor* stream_exec_; // Not owned.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 6bad66dcec..e4c834b30d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -73,7 +73,7 @@ void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
// -----------------------------------------------------------------------------
// GPUDebugAllocator
// -----------------------------------------------------------------------------
-GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
+GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator,
CudaGpuId cuda_gpu_id)
: base_allocator_(allocator) {
stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -111,6 +111,14 @@ void GPUDebugAllocator::DeallocateRaw(void* ptr) {
base_allocator_->DeallocateRaw(ptr);
}
+void GPUDebugAllocator::AddAllocVisitor(Visitor visitor) {
+ return base_allocator_->AddAllocVisitor(visitor);
+}
+
+void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) {
+ return base_allocator_->AddFreeVisitor(visitor);
+}
+
bool GPUDebugAllocator::TracksAllocationSizes() { return true; }
size_t GPUDebugAllocator::RequestedSize(const void* ptr) {
@@ -150,7 +158,7 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) {
// -----------------------------------------------------------------------------
// GPUNanResetAllocator
// -----------------------------------------------------------------------------
-GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
+GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator,
CudaGpuId cuda_gpu_id)
: base_allocator_(allocator) {
stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -192,6 +200,14 @@ void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
base_allocator_->DeallocateRaw(ptr);
}
+void GPUNanResetAllocator::AddAllocVisitor(Visitor visitor) {
+ return base_allocator_->AddAllocVisitor(visitor);
+}
+
+void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) {
+ return base_allocator_->AddFreeVisitor(visitor);
+}
+
size_t GPUNanResetAllocator::RequestedSize(const void* ptr) {
return base_allocator_->RequestedSize(ptr);
}
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 0f27ff4384..0f9b72040c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -21,7 +21,7 @@ limitations under the License.
#include <unordered_map>
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
@@ -31,13 +31,16 @@ namespace tensorflow {
// An allocator that wraps a GPU allocator and adds debugging
// functionality that verifies that users do not write outside their
// allocated memory.
-class GPUDebugAllocator : public Allocator {
+class GPUDebugAllocator : public VisitableAllocator {
public:
- explicit GPUDebugAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
+ explicit GPUDebugAllocator(VisitableAllocator* allocator,
+ CudaGpuId cuda_gpu_id);
~GPUDebugAllocator() override;
string Name() override { return "gpu_debug"; }
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
void DeallocateRaw(void* ptr) override;
+ void AddAllocVisitor(Visitor visitor) override;
+ void AddFreeVisitor(Visitor visitor) override;
bool TracksAllocationSizes() override;
size_t RequestedSize(const void* ptr) override;
size_t AllocatedSize(const void* ptr) override;
@@ -50,7 +53,7 @@ class GPUDebugAllocator : public Allocator {
bool CheckFooter(void* ptr);
private:
- Allocator* base_allocator_ = nullptr; // owned
+ VisitableAllocator* base_allocator_ = nullptr; // owned
se::StreamExecutor* stream_exec_; // Not owned.
@@ -60,20 +63,23 @@ class GPUDebugAllocator : public Allocator {
// An allocator that wraps a GPU allocator and resets the memory on
// allocation and free to 'NaN', helping to identify cases where the
// user forgets to initialize the memory.
-class GPUNanResetAllocator : public Allocator {
+class GPUNanResetAllocator : public VisitableAllocator {
public:
- explicit GPUNanResetAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id);
+ explicit GPUNanResetAllocator(VisitableAllocator* allocator,
+ CudaGpuId cuda_gpu_id);
~GPUNanResetAllocator() override;
string Name() override { return "gpu_nan_reset"; }
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
void DeallocateRaw(void* ptr) override;
+ void AddAllocVisitor(Visitor visitor) override;
+ void AddFreeVisitor(Visitor visitor) override;
size_t RequestedSize(const void* ptr) override;
size_t AllocatedSize(const void* ptr) override;
void GetStats(AllocatorStats* stats) override;
void ClearStats() override;
private:
- Allocator* base_allocator_ = nullptr; // owned
+ VisitableAllocator* base_allocator_ = nullptr; // owned
se::StreamExecutor* stream_exec_; // Not owned.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
index 98283cd846..236a0afa0b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -35,10 +35,7 @@ namespace {
TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
const CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+ GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
cuda_gpu_id);
auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -62,10 +59,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
EXPECT_DEATH(
{
const CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
- cuda_gpu_id, false /*use_unified_memory*/, {}, {});
- GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+ GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
cuda_gpu_id);
auto stream_exec =
GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -98,10 +92,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
EXPECT_DEATH(
{
const CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(),
- cuda_gpu_id, false /*use_unified_memory*/, {}, {});
- GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+ GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
cuda_gpu_id);
auto stream_exec =
GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -131,10 +122,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
TEST(GPUDebugAllocatorTest, ResetToNan) {
const CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUNanResetAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+ GPUNanResetAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
cuda_gpu_id);
auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -175,11 +163,8 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
const CudaGpuId cuda_gpu_id(0);
// NaN reset must be the outer-most allocator.
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
GPUNanResetAllocator a(
- new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+ new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
cuda_gpu_id),
cuda_gpu_id);
auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
@@ -220,21 +205,15 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
TEST(GPUDebugAllocatorTest, TracksSizes) {
const CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
- GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+ GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
cuda_gpu_id);
EXPECT_EQ(true, a.TracksAllocationSizes());
}
TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
const CudaGpuId cuda_gpu_id(0);
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- false /*use_unified_memory*/, {}, {});
GPUNanResetAllocator a(
- new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""),
+ new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
cuda_gpu_id),
cuda_gpu_id);
float* t1 = a.Allocate<float>(1);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 50e61b7e00..2763ac0d4a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -41,6 +41,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/core/common_runtime/local_device.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
@@ -284,38 +285,6 @@ BaseGPUDevice::~BaseGPUDevice() {
for (auto ctx : device_contexts_) ctx->Unref();
}
-// This should be idempotent if already initialized.
-Status BaseGPUDevice::InitScratchBuffers() {
- mutex_lock l(scratch_init_mutex_);
- if (scratch_.size() < max_streams_) {
- for (int i = 0; i < max_streams_; i++) {
- DCHECK(streams_[i]);
- if (scratch_.size() > i && scratch_[i]) continue;
- size_t scratch_buffer_size =
- Eigen::kCudaScratchSize + sizeof(unsigned int);
- void* scratch_buffer = gpu_allocator_->AllocateRaw(
- Allocator::kAllocatorAlignment, scratch_buffer_size);
- if (scratch_buffer == nullptr) {
- return errors::FailedPrecondition(
- "Failed to allocate scratch buffer for device ",
- tf_gpu_id_.value());
- }
- se::DeviceMemory<char> mem(
- se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
-
- bool ok = executor_->SynchronousMemZero(
- &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
- if (!ok) {
- return errors::FailedPrecondition(
- "Failed to memcopy into scratch buffer for device ",
- tf_gpu_id_.value());
- }
- scratch_.push_back(static_cast<char*>(scratch_buffer));
- }
- }
- return Status::OK();
-}
-
Status BaseGPUDevice::Init(const SessionOptions& options) {
auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
if (!executor_status.status().ok()) {
@@ -334,6 +303,27 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
for (int i = 0; i < max_streams_; i++) {
streams_.push_back(StreamGroupFactory::Global().GetOrCreate(
tf_gpu_id_, i, executor_, options.config.gpu_options()));
+
+ size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
+ void* scratch_buffer = gpu_allocator_->AllocateRaw(
+ Allocator::kAllocatorAlignment, scratch_buffer_size);
+ if (scratch_buffer == nullptr) {
+ return errors::FailedPrecondition(
+ "Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
+ }
+ scratch_.push_back(static_cast<char*>(scratch_buffer));
+
+ se::DeviceMemory<char> mem(
+ se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
+
+ bool ok = executor_->SynchronousMemZero(
+ &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
+ if (!ok) {
+ return errors::FailedPrecondition(
+ "Failed to memcopy into scratch buffer for device ",
+ tf_gpu_id_.value());
+ }
+
device_contexts_.push_back(new GPUDeviceContext(
i, streams_.back()->compute, streams_.back()->host_to_device,
streams_.back()->device_to_host, streams_.back()->device_to_device));
@@ -877,11 +867,10 @@ PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
return new ConcretePerOpGpuDevice();
}
-Status BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
- PerOpGpuDevice* device,
- DeviceContext* dc,
- Allocator* allocator) {
- TF_RETURN_IF_ERROR(InitScratchBuffers());
+void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
+ PerOpGpuDevice* device,
+ DeviceContext* dc,
+ Allocator* allocator) {
if (dc) {
const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
const int stream_id = gpu_dc->stream_id();
@@ -892,7 +881,6 @@ Status BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
} else {
ReinitializeDevice(context, device, 0, allocator);
}
- return Status::OK();
}
Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index b3eea55758..56d03d7a8c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -86,9 +86,8 @@ class BaseGPUDevice : public LocalDevice {
// The caller owns the returned device.
PerOpGpuDevice* MakeGpuDevice() override;
- Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
- DeviceContext* dc,
- Allocator* allocator) override;
+ void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
+ DeviceContext* dc, Allocator* allocator) override;
// Returns the CUDA GPU id of this device within the native driver system;
// e.g., for CUDA this is the ordinal of the GPU within the system.
@@ -126,7 +125,6 @@ class BaseGPUDevice : public LocalDevice {
class StreamGroupFactory;
gtl::InlinedVector<StreamGroup*, 4> streams_;
- mutex scratch_init_mutex_;
gtl::InlinedVector<char*, 4> scratch_;
std::vector<GPUDeviceContext*> device_contexts_;
GpuDeviceInfo* gpu_device_info_ = nullptr;
@@ -137,9 +135,6 @@ class BaseGPUDevice : public LocalDevice {
std::unique_ptr<EventMgr> em_;
std::unique_ptr<thread::ThreadPool> thread_pool_;
- // Initialize scractch buffers used by Eigen.
- Status InitScratchBuffers();
-
void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
int stream_id, Allocator* allocator);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index 9ec740fabe..b18688174d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -76,16 +76,12 @@ GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
// This function is defined for debugging problems with the allocators.
GPUProcessState::~GPUProcessState() {
CHECK_EQ(this, instance_);
+ for (auto p : gpu_allocators_) {
+ delete p;
+ }
instance_ = nullptr;
}
-int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
- // Return the NUMA node associated with the GPU's StreamExecutor.
- se::StreamExecutor* se =
- GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
- return se->GetDeviceDescription().numa_node();
-}
-
Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
TfGpuId tf_gpu_id,
size_t total_bytes) {
@@ -97,10 +93,13 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
gpu_allocators_.resize(tf_gpu_id.value() + 1);
+ if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
+ gpu_al_.resize(tf_gpu_id.value() + 1);
}
- AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
- if (allocator_parts.allocator.get() == nullptr) {
+ if (gpu_allocators_[tf_gpu_id.value()] == nullptr) {
+ VisitableAllocator* gpu_allocator;
+
// Validate allocator types.
if (!allocator_type.empty() && allocator_type != "BFC") {
LOG(ERROR) << "Invalid allocator type: " << allocator_type;
@@ -109,17 +108,8 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
CudaGpuId cuda_gpu_id;
TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
- int bus_id = BusIdForGPU(tf_gpu_id);
- while (bus_id >= gpu_visitors_.size()) {
- gpu_visitors_.push_back({});
- }
- GPUMemAllocator* sub_allocator = new GPUMemAllocator(
- GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
- (options.per_process_gpu_memory_fraction() > 1.0 ||
- options.experimental().use_unified_memory()),
- gpu_visitors_[bus_id], {});
- Allocator* gpu_allocator =
- new GPUBFCAllocator(sub_allocator, total_bytes, options,
+ gpu_allocator =
+ new GPUBFCAllocator(cuda_gpu_id, total_bytes, options,
strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
// If true, checks for memory overwrites by writing
@@ -133,25 +123,34 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
// **WARNING** probably will not work in a multi-gpu scenario
gpu_allocator = new GPUcudaMallocAllocator(gpu_allocator, cuda_gpu_id);
}
-
- Allocator* recording_allocator = nullptr;
+ gpu_allocators_[tf_gpu_id.value()] = gpu_allocator;
+
+ // If there are any pending AllocVisitors for this bus, add
+ // them now.
+ se::StreamExecutor* se =
+ GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
+ int bus_id = se->GetDeviceDescription().numa_node();
+ if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
+ for (const auto& v : gpu_visitors_[bus_id]) {
+ gpu_allocator->AddAllocVisitor(v);
+ }
+ }
if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
ProcessState::MemDesc md;
md.loc = ProcessState::MemDesc::GPU;
md.dev_index = cuda_gpu_id.value();
md.gpu_registered = false;
md.nic_registered = true;
- recording_allocator = new internal::RecordingAllocator(
+ if (static_cast<int64>(gpu_al_.size()) <= tf_gpu_id.value()) {
+ gpu_al_.resize(tf_gpu_id.value() + 1);
+ }
+ gpu_al_[tf_gpu_id.value()] = new internal::RecordingAllocator(
&process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
}
- allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator), sub_allocator,
- std::unique_ptr<Allocator>(recording_allocator)};
- }
- if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
- return allocator_parts.recording_allocator.get();
- } else {
- return allocator_parts.allocator.get();
}
+ if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
+ return gpu_al_[tf_gpu_id.value()];
+ return gpu_allocators_[tf_gpu_id.value()];
#else
LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda.";
return nullptr;
@@ -173,12 +172,11 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
tf_shared_lock lock(mu_);
if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
- !cuda_host_allocators_.empty() &&
- cuda_host_allocators_[0].recording_allocator != nullptr) {
- return cuda_host_allocators_[0].recording_allocator.get();
+ static_cast<int>(cuda_al_.size()) > 0) {
+ return cuda_al_[0];
}
if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
- return cuda_host_allocators_[0].allocator.get();
+ return cuda_host_allocators_[0];
}
}
@@ -192,7 +190,7 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
// it knows is valid.
se::StreamExecutor* se = nullptr;
for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
- if (gpu_allocators_[i].allocator != nullptr) {
+ if (gpu_allocators_[i] != nullptr) {
se = GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
break;
}
@@ -201,15 +199,6 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
CHECK_NE(nullptr, se);
while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) {
- while (cuda_host_alloc_visitors_.size() <= numa_node) {
- cuda_host_alloc_visitors_.push_back({});
- }
- while (cuda_host_free_visitors_.size() <= numa_node) {
- cuda_host_free_visitors_.push_back({});
- }
- SubAllocator* sub_allocator = new CUDAHostAllocator(
- se, numa_node, cuda_host_alloc_visitors_[numa_node],
- cuda_host_free_visitors_[numa_node]);
// TODO(zheng-xq): evaluate whether 64GB by default is the best choice.
int64 cuda_host_mem_limit_in_mb = -1;
Status status = ReadInt64FromEnvVar("TF_CUDA_HOST_MEM_LIMIT_IN_MB",
@@ -219,92 +208,62 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
LOG(ERROR) << "GetCUDAHostAllocator: " << status.error_message();
}
int64 cuda_host_mem_limit = cuda_host_mem_limit_in_mb * (1LL << 20);
- Allocator* allocator =
- new BFCAllocator(sub_allocator, cuda_host_mem_limit,
+ VisitableAllocator* allocator =
+ new BFCAllocator(new CUDAHostAllocator(se), cuda_host_mem_limit,
true /*allow_growth*/, "cuda_host_bfc" /*name*/);
- if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
+ if (LogMemory::IsEnabled()) {
// Wrap the allocator to track allocation ids for better logging
// at the cost of performance.
- allocator = new TrackingAllocator(allocator, true);
+ allocator = new TrackingVisitableAllocator(allocator, true);
}
- cuda_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
- sub_allocator,
- std::unique_ptr<Allocator>(nullptr)});
- AllocatorParts& allocator_parts = cuda_host_allocators_.back();
+ cuda_host_allocators_.push_back(allocator);
if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
ProcessState::MemDesc md;
md.loc = ProcessState::MemDesc::CPU;
md.dev_index = 0;
md.gpu_registered = true;
md.nic_registered = false;
- allocator_parts.recording_allocator.reset(
- new internal::RecordingAllocator(&process_state_->mem_desc_map_,
- allocator_parts.allocator.get(), md,
- &mu_));
+ cuda_al_.push_back(new internal::RecordingAllocator(
+ &process_state_->mem_desc_map_, cuda_host_allocators_.back(), md,
+ &mu_));
}
}
- if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
- return cuda_host_allocators_[0].recording_allocator.get();
- } else {
- return cuda_host_allocators_[0].allocator.get();
- }
+ if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types)
+ return cuda_al_[0];
+ return cuda_host_allocators_[0];
}
void GPUProcessState::AddGPUAllocVisitor(int bus_id,
- const SubAllocator::Visitor& visitor) {
+ const AllocVisitor& visitor) {
+ CHECK(process_state_);
#if GOOGLE_CUDA
mutex_lock lock(mu_);
- CHECK(gpu_allocators_.empty()) // Crash OK
- << "AddGPUAllocVisitor must be called before "
- "first call to GetGPUAllocator.";
+ for (int i = 0; i < static_cast<int64>(gpu_allocators_.size()); ++i) {
+ se::StreamExecutor* se =
+ GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
+ if (gpu_allocators_[i] &&
+ (se->GetDeviceDescription().numa_node() + 1) == bus_id) {
+ gpu_allocators_[i]->AddAllocVisitor(visitor);
+ }
+ }
while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
- gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
+ gpu_visitors_.push_back(std::vector<AllocVisitor>());
}
gpu_visitors_[bus_id].push_back(visitor);
#endif // GOOGLE_CUDA
}
-void GPUProcessState::AddCUDAHostAllocVisitor(
- int numa_node, const SubAllocator::Visitor& visitor) {
-#if GOOGLE_CUDA
- mutex_lock lock(mu_);
- CHECK(cuda_host_allocators_.empty()) // Crash OK
- << "AddCUDAHostAllocVisitor must be called before "
- "first call to GetCUDAHostAllocator.";
- while (numa_node >= static_cast<int64>(cuda_host_alloc_visitors_.size())) {
- cuda_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>());
- }
- cuda_host_alloc_visitors_[numa_node].push_back(visitor);
-#endif // GOOGLE_CUDA
-}
-
-void GPUProcessState::AddCUDAHostFreeVisitor(
- int numa_node, const SubAllocator::Visitor& visitor) {
-#if GOOGLE_CUDA
- mutex_lock lock(mu_);
- CHECK(cuda_host_allocators_.empty()) // Crash OK
- << "AddCUDAHostFreeVisitor must be called before "
- "first call to GetCUDAHostAllocator.";
- while (numa_node >= static_cast<int64>(cuda_host_free_visitors_.size())) {
- cuda_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>());
- }
- cuda_host_free_visitors_[numa_node].push_back(visitor);
-#endif // GOOGLE_CUDA
-}
-
void GPUProcessState::TestOnlyReset() {
- if (process_state_) {
- process_state_->ProcessState::TestOnlyReset();
- }
+ process_state_->ProcessState::TestOnlyReset();
{
mutex_lock lock(mu_);
gpu_device_enabled_ = false;
- gpu_allocators_.clear();
gpu_visitors_.clear();
- cuda_host_allocators_.clear();
- cuda_host_alloc_visitors_.clear();
- cuda_host_free_visitors_.clear();
+ gtl::STLDeleteElements(&gpu_allocators_);
+ gtl::STLDeleteElements(&cuda_host_allocators_);
+ gtl::STLDeleteElements(&gpu_al_);
+ gtl::STLDeleteElements(&cuda_al_);
}
}
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
index 43e9a31660..cb41c3c6bd 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
@@ -32,6 +32,7 @@ limitations under the License.
namespace tensorflow {
class Allocator;
+class VisitableAllocator;
class PoolAllocator;
// Singleton that manages per-process state when GPUs are present.
@@ -71,30 +72,18 @@ class GPUProcessState {
virtual Allocator* GetCUDAHostAllocator(int numa_node);
- // Registers a Visitor to be invoked on new chunks of memory allocated by the
- // SubAllocator of every GPU proximate to the specified bus. The AllocVisitor
- // is provided with a memory pointer, a GPU id, and the size of the area it
- // identifies. The pointer is not guaranteed to be valid after the call
- // terminates. The intention is for this interface to be used for network
- // device memory registration. "bus_id" is platform-specific. On many
- // platforms it should be 0. On machines with multiple PCIe buses, it should
- // be the index of one of the PCIe buses (maybe the NUMA node at which the
- // PCIe is rooted). If the bus_id is invalid, results are undefined.
- virtual void AddGPUAllocVisitor(int bus_id,
- const SubAllocator::Visitor& visitor);
-
- // Registers a Visitor to be invoked on new chunks of memory allocated by
- // the SubAllocator of the CUDAHostAllocator for the given numa_node.
- virtual void AddCUDAHostAllocVisitor(int numa_node,
- const SubAllocator::Visitor& visitor);
-
- // Registers a Visitor to be invoked on each chunk handed back for freeing to
- // the SubAllocator of the CUDAHostAllocator for the given numa_node.
- virtual void AddCUDAHostFreeVisitor(int numa_node,
- const SubAllocator::Visitor& visitor);
-
- // Returns bus_id for the given GPU id.
- virtual int BusIdForGPU(TfGpuId tf_gpu_id);
+ // Registers a function to be called once on every new Region
+ // allocated by every GPURegionAllocator proximate to the specified
+ // bus. The AllocVisitor is provided with a memory pointer and the
+ // size of the area it identifies. The pointer is not guaranteed to
+ // be valid after the call terminates. The intention is for this
+ // interface to be used for network device memory registration.
+ // "bus_id" is platform-specific. On many platforms it
+ // should be 0. On machines with multiple PCIe buses, it should be
+ // the index of one of the PCIe buses. If the bus_id is invalid,
+ // results are undefined.
+ typedef std::function<void(void*, size_t)> AllocVisitor;
+ virtual void AddGPUAllocVisitor(int bus_id, const AllocVisitor& visitor);
protected:
GPUProcessState();
@@ -114,22 +103,17 @@ class GPUProcessState {
mutex mu_;
- struct AllocatorParts {
- std::unique_ptr<Allocator> allocator;
- SubAllocator* sub_allocator; // owned by allocator
- std::unique_ptr<Allocator> recording_allocator;
- };
- std::vector<AllocatorParts> gpu_allocators_ GUARDED_BY(mu_);
- std::vector<std::vector<SubAllocator::Visitor>> gpu_visitors_ GUARDED_BY(mu_);
-
- std::vector<AllocatorParts> cuda_host_allocators_ GUARDED_BY(mu_);
- std::vector<std::vector<SubAllocator::Visitor>> cuda_host_alloc_visitors_
- GUARDED_BY(mu_);
- std::vector<std::vector<SubAllocator::Visitor>> cuda_host_free_visitors_
- GUARDED_BY(mu_);
+ std::vector<VisitableAllocator*> gpu_allocators_ GUARDED_BY(mu_);
+ std::vector<std::vector<AllocVisitor>> gpu_visitors_ GUARDED_BY(mu_);
+ std::vector<Allocator*> cuda_host_allocators_ GUARDED_BY(mu_);
virtual ~GPUProcessState();
+ // Optional RecordingAllocators that wrap the corresponding
+ // Allocators for runtime attribute use analysis.
+ std::vector<Allocator*> gpu_al_ GUARDED_BY(mu_);
+ std::vector<Allocator*> cuda_al_ GUARDED_BY(mu_);
+
friend class GPUDeviceTest;
};
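
As a usage note for the AddGPUAllocVisitor comment restored above, the sketch below shows the network-memory-registration scenario it describes. It is hypothetical, not part of this commit: it assumes the usual GPUProcessState::singleton() accessor, and RegisterGpuRegionsForRdma plus bus_id 0 are illustrative choices.

// Hypothetical sketch: a transport layer registering GPU regions for DMA.
#include <cstddef>
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"

void RegisterGpuRegionsForRdma() {
  tensorflow::GPUProcessState* ps = tensorflow::GPUProcessState::singleton();
  ps->AddGPUAllocVisitor(/*bus_id=*/0, [](void* ptr, size_t num_bytes) {
    // A real implementation would hand [ptr, ptr + num_bytes) to the NIC
    // driver (e.g. an ibv_reg_mr-style registration) so it can DMA directly.
  });
}
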
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
index 6b2f6547b0..583bff2c07 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
@@ -31,8 +31,7 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) {
2 /*pool_size_limit*/, false /*auto_resize*/,
new CUDAHostAllocator(
platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
- .ValueOrDie(),
- 0 /*numa_node*/, {}, {}),
+ .ValueOrDie()),
new NoopRounder, "pool");
EXPECT_EQ(nullptr, pool.AllocateRaw(4 /*alignment*/, 0 /*num_bytes*/));
@@ -50,8 +49,7 @@ TEST(PoolAllocatorTest, ZeroSizePool) {
0 /*pool_size_limit*/, false /*auto_resize*/,
new CUDAHostAllocator(
platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
- .ValueOrDie(),
- 0 /*numa_node*/, {}, {}),
+ .ValueOrDie()),
new NoopRounder, "pool");
EXPECT_EQ(0, pool.get_from_pool_count());
@@ -84,8 +82,7 @@ TEST(PoolAllocatorTest, Alignment) {
0 /*pool_size_limit*/, false /*auto_resize*/,
new CUDAHostAllocator(
platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
- .ValueOrDie(),
- 0 /*numa_node*/, {}, {}),
+ .ValueOrDie()),
new NoopRounder, "pool");
for (int i = 0; i < 16; ++i) {
size_t alignment = 1 << i;
@@ -100,8 +97,8 @@ TEST(PoolAllocatorTest, Alignment) {
TEST(PoolAllocatorTest, AutoResize) {
PoolAllocator pool(2 /*pool_size_limit*/, true /*auto_resize*/,
- new BasicCPUAllocator(0 /*numa_node*/, {}, {}),
- new NoopRounder, "pool");
+ new BasicCPUAllocator(0 /*numa_node*/), new NoopRounder,
+ "pool");
// Alloc/dealloc 10 sizes just a few times, confirming pool size
// stays at 2.
@@ -126,32 +123,14 @@ TEST(PoolAllocatorTest, AutoResize) {
}
TEST(PoolAllocatorTest, CudaHostAllocator) {
- int alloc_count = 0;
- int64 alloc_size = 0;
- SubAllocator::Visitor alloc_visitor =
- [&alloc_count, &alloc_size](void* ptr, int numa_node, int64 size) {
- ++alloc_count;
- alloc_size += size;
- };
- int free_count = 0;
- int64 free_size = 0;
- SubAllocator::Visitor free_visitor =
- [&free_count, &free_size](void* ptr, int numa_node, int64 size) {
- ++free_count;
- free_size += size;
- };
se::Platform* platform =
se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
- CUDAHostAllocator* sub_allocator = new CUDAHostAllocator(
- platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
- .ValueOrDie(),
- 0 /*numa_node*/, {alloc_visitor}, {free_visitor});
- PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/,
- sub_allocator, new NoopRounder, "pool");
- EXPECT_EQ(0, alloc_count);
- EXPECT_EQ(0, alloc_size);
- EXPECT_EQ(0, free_count);
- EXPECT_EQ(0, free_size);
+ PoolAllocator pool(
+ 2 /*pool_size_limit*/, false /*auto_resize*/,
+ new CUDAHostAllocator(
+ platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
+ .ValueOrDie()),
+ new NoopRounder, "pool");
// Repeatedly Get a 16-byte value, confirming that there's only
// one real allocation.
@@ -159,10 +138,6 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
EXPECT_EQ(0, pool.get_from_pool_count());
EXPECT_EQ(1, pool.allocated_count());
EXPECT_NE(nullptr, p1_16);
- EXPECT_EQ(1, alloc_count); // Underlying suballoc of 16 bytes
- // Each suballocation includes a 16B ChunkPrefix.
- static const int kChunkPrefixSize = 16;
- EXPECT_EQ(16 + (alloc_count * kChunkPrefixSize), alloc_size);
pool.DeallocateRaw(p1_16);
// Pool contents {16}
EXPECT_EQ(1, pool.put_count());
@@ -173,9 +148,6 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
pool.DeallocateRaw(p2_16); // Put it back.
// Pool contents {16}
EXPECT_EQ(2, pool.put_count());
- EXPECT_EQ(1, alloc_count); // Underlying suballoc of 16 bytes
- EXPECT_EQ(16 + (alloc_count * kChunkPrefixSize), alloc_size);
- EXPECT_EQ(0, free_count);
// Get two more values of different sizes.
void* p3_4 = pool.AllocateRaw(4, 4);
@@ -188,9 +160,6 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
void* p4_2 = pool.AllocateRaw(4, 2); // Get a third size buffer.
EXPECT_NE(nullptr, p4_2);
EXPECT_EQ(0, pool.evicted_count());
- EXPECT_EQ(3, alloc_count);
- EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
- EXPECT_EQ(0, free_count);
// The pool is full: when we put back p4_2, the 16-byte buffer
// should be evicted since it was least recently inserted.
@@ -198,10 +167,6 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
// Pool contents {2, 4}
EXPECT_EQ(4, pool.put_count());
EXPECT_EQ(1, pool.evicted_count());
- EXPECT_EQ(3, alloc_count);
- EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
- EXPECT_EQ(1, free_count);
- EXPECT_EQ(16 + (free_count * kChunkPrefixSize), free_size);
// Re-getting and putting size 2 or 4 should not alter pool size or
// num-evicted.
@@ -215,20 +180,12 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
EXPECT_EQ(6, pool.put_count());
EXPECT_EQ(3, pool.allocated_count());
EXPECT_EQ(1, pool.evicted_count());
- EXPECT_EQ(3, alloc_count);
- EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
- EXPECT_EQ(1, free_count);
- EXPECT_EQ(16 + (free_count * kChunkPrefixSize), free_size);
pool.Clear();
EXPECT_EQ(0, pool.get_from_pool_count());
EXPECT_EQ(0, pool.put_count());
EXPECT_EQ(0, pool.allocated_count());
EXPECT_EQ(0, pool.evicted_count());
- EXPECT_EQ(3, alloc_count);
- EXPECT_EQ(16 + 4 + 2 + (alloc_count * kChunkPrefixSize), alloc_size);
- EXPECT_EQ(3, free_count);
- EXPECT_EQ(16 + 4 + 2 + (free_count * kChunkPrefixSize), free_size);
}
TEST(PoolAllocatorTest, Pow2Rounder) {
@@ -249,8 +206,7 @@ TEST(PoolAllocatorTest, Name) {
2 /*pool_size_limit*/, false /*auto_resize*/,
new CUDAHostAllocator(
platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
- .ValueOrDie(),
- 0 /*numa_node*/, {}, {}),
+ .ValueOrDie()),
new NoopRounder, "pool");
EXPECT_EQ("pool", pool.Name());
}
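
For reference, a minimal sketch (assuming the TensorFlow test build environment; CountUnderlyingAllocations is illustrative and not part of this patch) of how underlying suballocations can still be counted after this rollback: the visitor goes on the PoolAllocator via the restored VisitableAllocator interface rather than into the CUDAHostAllocator/BasicCPUAllocator constructors.

#include "tensorflow/core/common_runtime/pool_allocator.h"

namespace tensorflow {
namespace {

void CountUnderlyingAllocations() {
  PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/,
                     new BasicCPUAllocator(0 /*numa_node*/), new NoopRounder,
                     "pool");
  int alloc_count = 0;
  // Must be registered before the first AllocateRaw call; the restored
  // PoolAllocator CHECKs this.
  pool.AddAllocVisitor(
      [&alloc_count](void* ptr, size_t num_bytes) { ++alloc_count; });
  void* p = pool.AllocateRaw(4 /*alignment*/, 16 /*num_bytes*/);
  pool.DeallocateRaw(p);
  // alloc_count == 1: the visitor fires once, on the underlying suballocation,
  // not on pool hits.
}

}  // namespace
}  // namespace tensorflow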
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 538a70668a..df9c3a686c 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -23,11 +23,12 @@ limitations under the License.
#include <cstdlib>
#include "tensorflow/core/common_runtime/bfc_allocator.h"
-#include "tensorflow/core/common_runtime/pool_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/mem.h"
-#include "tensorflow/core/platform/numa.h"
+#include "tensorflow/core/platform/mutex.h"
#ifndef INTEL_MKL_DNN_ONLY
#include "i_malloc.h"
@@ -39,16 +40,20 @@ typedef unsigned int uint;
namespace tensorflow {
-class MklSubAllocator : public BasicCPUAllocator {
+class MklSubAllocator : public SubAllocator {
public:
- MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {}
~MklSubAllocator() override {}
+
+ void* Alloc(size_t alignment, size_t num_bytes) override {
+ return port::AlignedMalloc(num_bytes, alignment);
+ }
+ void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
};
// CPU allocator that handles small-size allocations by calling
// suballocator directly. Mostly, it is just a wrapper around a suballocator
// (that calls malloc and free directly) with support for bookkeeping.
-class MklSmallSizeAllocator : public Allocator {
+class MklSmallSizeAllocator : public VisitableAllocator {
public:
MklSmallSizeAllocator(SubAllocator* sub_allocator, size_t total_memory,
const string& name)
@@ -70,6 +75,10 @@ class MklSmallSizeAllocator : public Allocator {
CHECK(map_.insert(map_val).second);
// Increment statistics for small-size allocations.
IncrementStats(num_bytes);
+ // Call alloc visitors.
+ for (const auto& visitor : alloc_visitors_) {
+ visitor(ptr, num_bytes);
+ }
}
return ptr;
}
@@ -85,6 +94,9 @@ class MklSmallSizeAllocator : public Allocator {
if (map_iter != map_.end()) {
// Call free visitors.
size_t dealloc_bytes = map_iter->second;
+ for (const auto& visitor : free_visitors_) {
+ visitor(ptr, dealloc_bytes);
+ }
sub_allocator_->Free(ptr, dealloc_bytes);
DecrementStats(dealloc_bytes);
map_.erase(map_iter);
@@ -109,6 +121,16 @@ class MklSmallSizeAllocator : public Allocator {
stats_.Clear();
}
+ void AddAllocVisitor(Visitor visitor) override {
+ mutex_lock l(mutex_);
+ alloc_visitors_.push_back(visitor);
+ }
+
+ void AddFreeVisitor(Visitor visitor) override {
+ mutex_lock l(mutex_);
+ free_visitors_.push_back(visitor);
+ }
+
private:
// Increment statistics for the allocator handling small allocations.
inline void IncrementStats(size_t alloc_size)
@@ -141,11 +163,15 @@ class MklSmallSizeAllocator : public Allocator {
// Allocator stats for small allocs
AllocatorStats stats_ GUARDED_BY(mutex_);
+
+ // Visitors
+ std::vector<Visitor> alloc_visitors_ GUARDED_BY(mutex_);
+ std::vector<Visitor> free_visitors_ GUARDED_BY(mutex_);
};
/// CPU allocator for MKL that wraps BFC allocator and intercepts
/// and redirects memory allocation calls from MKL.
-class MklCPUAllocator : public Allocator {
+class MklCPUAllocator : public VisitableAllocator {
public:
// Constructor and other standard functions
@@ -258,6 +284,16 @@ class MklCPUAllocator : public Allocator {
large_size_allocator_->ClearStats();
}
+ void AddAllocVisitor(Visitor visitor) override {
+ small_size_allocator_->AddAllocVisitor(visitor);
+ large_size_allocator_->AddAllocVisitor(visitor);
+ }
+
+ void AddFreeVisitor(Visitor visitor) override {
+ small_size_allocator_->AddFreeVisitor(visitor);
+ large_size_allocator_->AddFreeVisitor(visitor);
+ }
+
private:
// Hooks provided by this allocator for memory allocation routines from MKL
@@ -294,7 +330,7 @@ class MklCPUAllocator : public Allocator {
// The alignment that we need for the allocations
static constexpr const size_t kAlignment = 64;
- Allocator* large_size_allocator_; // owned by this class
+ VisitableAllocator* large_size_allocator_; // owned by this class
MklSmallSizeAllocator* small_size_allocator_; // owned by this class.
SubAllocator* sub_allocator_; // not owned by this class
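
A hedged usage sketch (assumes an INTEL_MKL build of TensorFlow; RegisterMklVisitors is illustrative) of the forwarding behavior restored above: a visitor added to MklCPUAllocator reaches both the small-size allocator and the underlying BFC allocator.

#include "tensorflow/core/common_runtime/mkl_cpu_allocator.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

void RegisterMklVisitors(MklCPUAllocator* mkl_allocator) {
  // Each visitor is forwarded to both the small-size allocator and the
  // underlying BFC allocator, so it sees every region the allocator obtains.
  mkl_allocator->AddAllocVisitor([](void* ptr, size_t num_bytes) {
    VLOG(1) << "MKL CPU region allocated: " << ptr << ", " << num_bytes
            << " bytes";
  });
  mkl_allocator->AddFreeVisitor([](void* ptr, size_t num_bytes) {
    VLOG(1) << "MKL CPU region freed: " << ptr << ", " << num_bytes
            << " bytes";
  });
}

}  // namespace tensorflow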
diff --git a/tensorflow/core/common_runtime/pool_allocator.cc b/tensorflow/core/common_runtime/pool_allocator.cc
index 66dc8f3322..fdad8de8d6 100644
--- a/tensorflow/core/common_runtime/pool_allocator.cc
+++ b/tensorflow/core/common_runtime/pool_allocator.cc
@@ -40,7 +40,8 @@ PoolAllocator::PoolAllocator(size_t pool_size_limit, bool auto_resize,
auto_resize_(auto_resize),
pool_size_limit_(pool_size_limit),
allocator_(allocator),
- size_rounder_(size_rounder) {
+ size_rounder_(size_rounder),
+ allocation_begun_(false) {
if (auto_resize) {
CHECK_LT(size_t{0}, pool_size_limit)
<< "size limit must be > 0 if auto_resize is true.";
@@ -92,6 +93,7 @@ ChunkPrefix* FindPrefix(void* user_ptr) {
} // namespace
void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
+ if (!allocation_begun_) allocation_begun_ = true;
if (num_bytes == 0) return nullptr;
// If alignment is larger than kPoolAlignment, increase num_bytes so that we
@@ -127,6 +129,9 @@ void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
return PrepareChunk(r, alignment, num_bytes);
} else {
void* ptr = allocator_->Alloc(kPoolAlignment, num_bytes);
+ for (const auto& v : alloc_visitors_) {
+ v(ptr, num_bytes);
+ }
return PrepareChunk(ptr, alignment, num_bytes);
}
}
@@ -136,6 +141,9 @@ void PoolAllocator::DeallocateRaw(void* ptr) {
ChunkPrefix* cp = FindPrefix(ptr);
CHECK_LE((void*)cp, (void*)ptr);
if (!has_size_limit_ && !auto_resize_) {
+ for (const auto& v : free_visitors_) {
+ v(cp, cp->num_bytes);
+ }
allocator_->Free(cp, cp->num_bytes);
} else {
mutex_lock lock(mutex_);
@@ -156,6 +164,9 @@ void PoolAllocator::Clear() {
mutex_lock lock(mutex_);
for (auto iter : pool_) {
PtrRecord* pr = iter.second;
+ for (const auto& v : free_visitors_) {
+ v(pr->ptr, pr->num_bytes);
+ }
allocator_->Free(pr->ptr, pr->num_bytes);
delete pr;
}
@@ -210,6 +221,9 @@ void PoolAllocator::EvictOne() {
DCHECK(iter != pool_.end());
}
pool_.erase(iter);
+ for (const auto& v : free_visitors_) {
+ v(prec->ptr, prec->num_bytes);
+ }
allocator_->Free(prec->ptr, prec->num_bytes);
delete prec;
++evicted_count_;
@@ -255,19 +269,28 @@ void PoolAllocator::EvictOne() {
}
}
+void PoolAllocator::AddAllocVisitor(Visitor visitor) {
+ mutex_lock lock(mutex_);
+ CHECK(!allocation_begun_)
+ << "AddAllocVisitor may not be called after pool allocation "
+ << "has begun.";
+ alloc_visitors_.push_back(visitor);
+}
+
+void PoolAllocator::AddFreeVisitor(Visitor visitor) {
+ mutex_lock lock(mutex_);
+ CHECK(!allocation_begun_)
+ << "AddFreeVisitor may not be called after pool allocation "
+ << "has begun.";
+ free_visitors_.push_back(visitor);
+}
+
void* BasicCPUAllocator::Alloc(size_t alignment, size_t num_bytes) {
- void* ptr = nullptr;
- if (num_bytes > 0) {
- ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
- VisitAlloc(ptr, numa_node_, num_bytes);
- }
- return ptr;
+ return port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
}
void BasicCPUAllocator::Free(void* ptr, size_t num_bytes) {
- if (num_bytes > 0) {
- VisitFree(ptr, numa_node_, num_bytes);
- port::AlignedFree(ptr);
- }
+ port::AlignedFree(ptr);
}
+
} // namespace tensorflow
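
The register-before-first-use invariant enforced above (allocation_begun_) is what lets AllocateRaw read alloc_visitors_ without taking the lock. A standalone sketch of the same guard pattern, with illustrative names unrelated to the TensorFlow classes:

#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <mutex>
#include <vector>

class VisitedAllocator {
 public:
  using Visitor = std::function<void(void*, size_t)>;

  void AddAllocVisitor(Visitor v) {
    std::lock_guard<std::mutex> lock(mu_);
    // Mirrors the CHECK in PoolAllocator::AddAllocVisitor: registration is
    // rejected once allocation has begun, so reads of the vector in the
    // allocation path can safely skip the lock.
    assert(!allocation_begun_.load());
    alloc_visitors_.push_back(std::move(v));
  }

  void* Allocate(size_t num_bytes) {
    allocation_begun_.store(true);
    void* ptr = std::malloc(num_bytes);
    for (const auto& v : alloc_visitors_) v(ptr, num_bytes);
    return ptr;
  }

 private:
  std::mutex mu_;
  std::vector<Visitor> alloc_visitors_;
  std::atomic<bool> allocation_begun_{false};
};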
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index 5b4623ba10..607734445b 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -16,13 +16,14 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_POOL_ALLOCATOR_H_
-// Simple LRU pool allocators for various flavors of CPU RAM.
+// Simple LRU pool allocators for various flavors of CPU RAM that
+// implement the VisitableAllocator interface.
#include <atomic>
#include <map>
#include <memory>
#include <vector>
-#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
@@ -40,7 +41,7 @@ class RoundUpInterface {
// Size-limited pool of memory buffers obtained from a SubAllocator
// instance. Pool eviction policy is LRU.
-class PoolAllocator : public Allocator {
+class PoolAllocator : public VisitableAllocator {
public:
// "pool_size_limit" is the maximum number of returned, re-usable
// memory buffers to keep in the pool. If pool_size_limit == 0, the
@@ -63,6 +64,14 @@ class PoolAllocator : public Allocator {
void DeallocateRaw(void* ptr) override;
+ // REQUIRES: The following functions may only be called prior
+ // to the first Allocate*() call. Once allocation has begun, it is
+ // illegal to register another visitor.
+
+ void AddAllocVisitor(Visitor visitor) override;
+
+ void AddFreeVisitor(Visitor visitor) override;
+
// Allocate an unused memory region of size "num_bytes". Fetch from
// the pool if available, otherwise call allocator_.
void* Get(size_t num_bytes);
@@ -132,6 +141,12 @@ class PoolAllocator : public Allocator {
int64 put_count_ GUARDED_BY(mutex_) = 0;
int64 allocated_count_ GUARDED_BY(mutex_) = 0;
int64 evicted_count_ GUARDED_BY(mutex_) = 0;
+ // Write access to these is guarded by mutex_, but not read
+ // access. They may only be modified prior to the first
+ // allocation. Later attempts to modify will fail.
+ std::vector<Visitor> alloc_visitors_;
+ std::vector<Visitor> free_visitors_;
+ std::atomic<bool> allocation_begun_;
};
// Do-nothing rounder. Passes through sizes unchanged.
@@ -151,9 +166,7 @@ class Pow2Rounder : public RoundUpInterface {
class BasicCPUAllocator : public SubAllocator {
public:
// Argument numa_node is currently ignored.
- BasicCPUAllocator(int numa_node, const std::vector<Visitor>& alloc_visitors,
- const std::vector<Visitor>& free_visitors)
- : SubAllocator(alloc_visitors, free_visitors), numa_node_(numa_node) {}
+ explicit BasicCPUAllocator(int numa_node) : numa_node_(numa_node) {}
~BasicCPUAllocator() override {}
@@ -163,8 +176,6 @@ class BasicCPUAllocator : public SubAllocator {
private:
int numa_node_;
-
- TF_DISALLOW_COPY_AND_ASSIGN(BasicCPUAllocator);
};
} // namespace tensorflow
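
With the visitor plumbing removed from SubAllocator, BasicCPUAllocator reduces to thin wrappers over port::AlignedMalloc/AlignedFree. An illustrative sketch (assumes the TensorFlow build environment; RawSubAllocatorExample is not part of the patch):

#include "tensorflow/core/common_runtime/pool_allocator.h"

namespace tensorflow {

void RawSubAllocatorExample() {
  BasicCPUAllocator sub(0 /*numa_node, currently ignored*/);
  // After this rollback, Alloc/Free map directly onto port::AlignedMalloc and
  // port::AlignedFree, with no visitor bookkeeping and no zero-byte special
  // case.
  void* ptr = sub.Alloc(64 /*alignment*/, 1024 /*num_bytes*/);
  sub.Free(ptr, 1024 /*num_bytes*/);
}

}  // namespace tensorflow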
diff --git a/tensorflow/core/common_runtime/process_state.cc b/tensorflow/core/common_runtime/process_state.cc
index bcaa37fc8a..447338e7bd 100644
--- a/tensorflow/core/common_runtime/process_state.cc
+++ b/tensorflow/core/common_runtime/process_state.cc
@@ -71,28 +71,20 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
return MemDesc();
}
-Allocator* ProcessState::GetCPUAllocator(int numa_node) {
+VisitableAllocator* ProcessState::GetCPUAllocator(int numa_node) {
CHECK_GE(numa_node, 0);
if (!numa_enabled_) numa_node = 0;
mutex_lock lock(mu_);
while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
- // If visitors have been defined we need an Allocator built from
- // a SubAllocator. Prefer BFCAllocator, but fall back to PoolAllocator
- // depending on env var setting.
- const bool alloc_visitors_defined =
- (!cpu_alloc_visitors_.empty() || !cpu_free_visitors_.empty());
bool use_bfc_allocator = false;
- Status status = ReadBoolFromEnvVar(
- "TF_CPU_ALLOCATOR_USE_BFC", alloc_visitors_defined, &use_bfc_allocator);
+    // TODO(reedwm): Switch default to BFCAllocator if it's at least as fast and
+ // efficient.
+ Status status = ReadBoolFromEnvVar("TF_CPU_ALLOCATOR_USE_BFC", false,
+ &use_bfc_allocator);
if (!status.ok()) {
LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
}
- Allocator* allocator = nullptr;
- SubAllocator* sub_allocator =
- (alloc_visitors_defined || use_bfc_allocator)
- ? new BasicCPUAllocator(numa_enabled_ ? numa_node : -1,
- cpu_alloc_visitors_, cpu_free_visitors_)
- : nullptr;
+ VisitableAllocator* allocator;
if (use_bfc_allocator) {
// TODO(reedwm): evaluate whether 64GB by default is the best choice.
int64 cpu_mem_limit_in_mb = -1;
@@ -103,63 +95,34 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
}
int64 cpu_mem_limit = cpu_mem_limit_in_mb * (1LL << 20);
- DCHECK(sub_allocator);
- allocator =
- new BFCAllocator(sub_allocator, cpu_mem_limit, true /*allow_growth*/,
- "bfc_cpu_allocator_for_gpu" /*name*/);
+ allocator = new BFCAllocator(
+ new BasicCPUAllocator(numa_enabled_ ? numa_node : -1), cpu_mem_limit,
+ true /*allow_growth*/, "bfc_cpu_allocator_for_gpu" /*name*/);
VLOG(2) << "Using BFCAllocator with memory limit of "
<< cpu_mem_limit_in_mb << " MB for ProcessState CPU allocator";
- } else if (alloc_visitors_defined) {
- DCHECK(sub_allocator);
- allocator =
- new PoolAllocator(100 /*pool_size_limit*/, true /*auto_resize*/,
- sub_allocator, new NoopRounder, "cpu_pool");
+ } else {
+ allocator = new PoolAllocator(
+ 100 /*pool_size_limit*/, true /*auto_resize*/,
+ new BasicCPUAllocator(numa_enabled_ ? numa_node : -1),
+ new NoopRounder, "cpu_pool");
VLOG(2) << "Using PoolAllocator for ProcessState CPU allocator "
<< "numa_enabled_=" << numa_enabled_
<< " numa_node=" << numa_node;
- } else {
- DCHECK(!sub_allocator);
- allocator = cpu_allocator();
}
- if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
+ if (LogMemory::IsEnabled()) {
// Wrap the allocator to track allocation ids for better logging
// at the cost of performance.
- allocator = new TrackingAllocator(allocator, true);
+ allocator = new TrackingVisitableAllocator(allocator, true);
}
cpu_allocators_.push_back(allocator);
- if (!sub_allocator) {
- DCHECK(cpu_alloc_visitors_.empty() && cpu_free_visitors_.empty());
- }
}
return cpu_allocators_[numa_node];
}
-void ProcessState::AddCPUAllocVisitor(SubAllocator::Visitor visitor) {
- VLOG(1) << "AddCPUAllocVisitor";
- mutex_lock lock(mu_);
- CHECK_EQ(0, cpu_allocators_.size()) // Crash OK
- << "AddCPUAllocVisitor must be called prior to first call to "
- "ProcessState::GetCPUAllocator";
- cpu_alloc_visitors_.push_back(std::move(visitor));
-}
-
-void ProcessState::AddCPUFreeVisitor(SubAllocator::Visitor visitor) {
- mutex_lock lock(mu_);
- CHECK_EQ(0, cpu_allocators_.size()) // Crash OK
- << "AddCPUFreeVisitor must be called prior to first call to "
- "ProcessState::GetCPUAllocator";
- cpu_free_visitors_.push_back(std::move(visitor));
-}
-
void ProcessState::TestOnlyReset() {
mutex_lock lock(mu_);
- // Don't delete this value because it's static.
- Allocator* default_cpu_allocator = cpu_allocator();
mem_desc_map_.clear();
- for (Allocator* a : cpu_allocators_) {
- if (a != default_cpu_allocator) delete a;
- }
- cpu_allocators_.clear();
+ gtl::STLDeleteElements(&cpu_allocators_);
gtl::STLDeleteElements(&cpu_al_);
}
diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h
index cac312d849..2892677333 100644
--- a/tensorflow/core/common_runtime/process_state.h
+++ b/tensorflow/core/common_runtime/process_state.h
@@ -30,6 +30,7 @@ limitations under the License.
namespace tensorflow {
class Allocator;
+class VisitableAllocator;
class PoolAllocator;
// Singleton that manages per-process state, e.g. allocation of
@@ -64,15 +65,7 @@ class ProcessState {
// Returns the one CPUAllocator used for the given numa_node.
// TEMPORARY: ignores numa_node.
- Allocator* GetCPUAllocator(int numa_node);
-
- // Registers alloc visitor for the CPU allocator(s).
- // REQUIRES: must be called before GetCPUAllocator.
- void AddCPUAllocVisitor(SubAllocator::Visitor v);
-
- // Registers free visitor for the CPU allocator(s).
- // REQUIRES: must be called before GetCPUAllocator.
- void AddCPUFreeVisitor(SubAllocator::Visitor v);
+ VisitableAllocator* GetCPUAllocator(int numa_node);
typedef std::unordered_map<const void*, MemDesc> MDMap;
@@ -94,9 +87,7 @@ class ProcessState {
mutex mu_;
- std::vector<Allocator*> cpu_allocators_ GUARDED_BY(mu_);
- std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ GUARDED_BY(mu_);
- std::vector<SubAllocator::Visitor> cpu_free_visitors_ GUARDED_BY(mu_);
+ std::vector<VisitableAllocator*> cpu_allocators_ GUARDED_BY(mu_);
virtual ~ProcessState();
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
index 9d59264899..103eee03b3 100644
--- a/tensorflow/core/common_runtime/renamed_device.h
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -72,10 +72,9 @@ class RenamedDevice : public Device {
return underlying_->MakeGpuDevice();
}
- Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
- DeviceContext* dc,
- Allocator* allocator) override {
- return underlying_->ReinitializeGpuDevice(context, device, dc, allocator);
+ void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
+ DeviceContext* dc, Allocator* allocator) override {
+ underlying_->ReinitializeGpuDevice(context, device, dc, allocator);
}
Status MakeTensorFromProto(const TensorProto& tensor_proto,
diff --git a/tensorflow/core/common_runtime/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h
new file mode 100644
index 0000000000..ae0563a96a
--- /dev/null
+++ b/tensorflow/core/common_runtime/visitable_allocator.h
@@ -0,0 +1,79 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
+
+#include <functional>
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/tracking_allocator.h"
+
+namespace tensorflow {
+
+// Subclass VisitableAllocator instead of Allocator when a memory
+// allocator needs to enable some kind of registration/deregistration
+// of memory areas.
+class VisitableAllocator : public Allocator {
+ public:
+ // Visitor gets called with a pointer to a memory area and its
+ // size in bytes.
+ typedef std::function<void(void*, size_t)> Visitor;
+
+ // Register a visitor guaranteed to be called exactly once on each
+ // chunk of memory newly allocated from the underlying device.
+ // Typically, chunks will be reused and possibly sub-divided by a
+ // pool manager, so the calls will happen only once per process
+ // execution, not once per tensor (re)allocation.
+ virtual void AddAllocVisitor(Visitor visitor) = 0;
+
+ // Register a visitor guaranteed to be called on each chunk of
+ // memory returned to the underlying device.
+ virtual void AddFreeVisitor(Visitor visitor) = 0;
+};
+
+// Needed for cases when a VisitableAllocator gets wrapped for tracking.
+// Multiple inheritance is considered acceptable in this case because
+// VisitableAllocator is a pure virtual interface and only TrackingAllocator
+// has a default implementation.
+class TrackingVisitableAllocator : public TrackingAllocator,
+ public VisitableAllocator {
+ public:
+ TrackingVisitableAllocator(VisitableAllocator* allocator, bool track_ids)
+ : TrackingAllocator(allocator, track_ids), allocator_(allocator) {}
+ ~TrackingVisitableAllocator() override {}
+
+ string Name() override { return TrackingAllocator::Name(); }
+
+ void* AllocateRaw(size_t alignment, size_t num_bytes) override {
+ return TrackingAllocator::AllocateRaw(alignment, num_bytes);
+ }
+
+ void DeallocateRaw(void* ptr) override {
+ TrackingAllocator::DeallocateRaw(ptr);
+ }
+
+ void AddAllocVisitor(Visitor visitor) override {
+ allocator_->AddAllocVisitor(visitor);
+ }
+
+ void AddFreeVisitor(Visitor visitor) override {
+ allocator_->AddFreeVisitor(visitor);
+ }
+
+ protected:
+ VisitableAllocator* allocator_;
+};
+} // namespace tensorflow
+#endif // TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
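
A minimal sketch of the wrapping-for-tracking case this header describes, mirroring the LogMemory branch restored in process_state.cc above (assumes the TensorFlow build environment; MaybeWrapForTracking is illustrative):

#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/framework/log_memory.h"

namespace tensorflow {

VisitableAllocator* MaybeWrapForTracking(VisitableAllocator* allocator) {
  if (LogMemory::IsEnabled()) {
    // TrackingVisitableAllocator records allocation ids for logging while
    // still forwarding AddAllocVisitor/AddFreeVisitor to the wrapped
    // allocator, so visitor registration keeps working after the wrap.
    return new TrackingVisitableAllocator(allocator, true /*track_ids*/);
  }
  return allocator;
}

}  // namespace tensorflow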