Eliminate VisitableAllocator.

The visitor pattern is used to allow pre-registration of memory for DMA access, e.g. for fast GPU/CPU i/o and for RDMA networking. The VisitableAllocator interface was introduced to support this use some time ago, prior to SubAllocators. Memory registration works best if it's done infrequently, on large pieces of memory, rather than on every piece that's dynamically allocated/freed. This usage pattern fits the SubAllocator better than a general Allocator. This change moves memory allocation visitor access to SubAllocator and eliminates the VisitableAllocator subclass of Allocator. This change also more rigorously enforces the requirement that all Visitors be declared prior to memory allocation begining. This is accomplished by requiring that Visitors be provided to the SubAllocator constructor. This refactoring will ease an upcoming CL introducing NUMA specific CPU devices. It also should fix some performance pitfalls (e.g. accidental use of PoolAllocator) introduced by an earlier refactoring of ProcessState that was also in preparation for NUMA. It restores the default use of the cpu_allocator() value (i.e. no SubAllocator) by model executions that don't use allocation visitors (since visitor registration must precede the first allocation, hence can be detected at that time). PiperOrigin-RevId: 213371553
author: A. Unique TensorFlower <gardener@tensorflow.org> 2018-09-17 17:49:36 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-09-17 17:53:46 -0700
commit: 185aa89912376d4088c22615908696cd30f9951b (patch)
tree: ba465945c19904129b830cc2465a3ee7b58b9247 /tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
parent: 4338803b98cd825b0b1d810bcc51c9a79734feb6 (diff)
1 files changed, 75 insertions, 15 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 67caeb3495..7112c3afd4 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -46,7 +47,11 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
 }
 
 TEST(GPUBFCAllocatorTest, NoDups) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   // Allocate a lot of raw pointers
@@ -75,7 +80,11 @@ TEST(GPUBFCAllocatorTest, NoDups) {
 }
 
 TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   // Allocate 256 raw pointers of sizes between 100 bytes and about
   // a meg
   random::PhiloxRandom philox(123, 17);
@@ -133,7 +142,11 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
 }
 
 TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   float* first_ptr = a.Allocate<float>(1024);
@@ -168,18 +181,30 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
 }
 
 TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   float* ptr = a.Allocate<float>(0);
   EXPECT_EQ(nullptr, ptr);
 }
 
 TEST(GPUBFCAllocatorTest, TracksSizes) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
 TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   float* t1 = a.Allocate<float>(1);
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
@@ -187,8 +212,12 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
 }
 
 TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
   // Configure a 1MiB byte limit
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc");
+  GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc");
 
   float* first_ptr = a.Allocate<float>(1 << 6);
   float* second_ptr = a.Allocate<float>(1 << 20);
@@ -203,7 +232,11 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
   options.set_allow_growth(true);
 
   // Max of 2GiB, but starts out small.
-  GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1LL << 31, "GPU_0_bfc");
 
   // Allocate 10 raw pointers of sizes between 100 bytes and about
   // 64 megs.
@@ -264,8 +297,15 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
 }
 
 TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
-  GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
-  GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1UL << 60, "GPU_0_bfc");
+  sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator b(sub_allocator, 1UL << 60, "GPU_0_bfc");
   void* amem = a.AllocateRaw(1, 1);
   void* bmem = b.AllocateRaw(1, 1 << 30);
   a.DeallocateRaw(amem);
@@ -273,7 +313,11 @@ TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
 }
 
 static void BM_Allocation(int iters) {
-  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<size_t> sizes = {256,        4096,      16384,    524288,
                                512,        1048576,   10485760, 104857600,
@@ -289,7 +333,11 @@ static void BM_Allocation(int iters) {
 BENCHMARK(BM_Allocation);
 
 static void BM_AllocationThreaded(int iters, int num_threads) {
-  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
   thread::ThreadPool pool(Env::Default(), "test", num_threads);
   std::atomic_int_fast32_t count(iters);
   mutex done_lock;
@@ -325,7 +373,11 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);
 // A more complex benchmark that defers deallocation of an object for
 // "delay" allocations.
 static void BM_AllocationDelayed(int iters, int delay) {
-  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+  CudaGpuId cuda_gpu_id(0);
+  GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+      false /*use_unified_memory*/, {}, {});
+  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
   int size_index = 0;
@@ -363,7 +415,11 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   // only methods inside this class can access private members of BFCAllocator.
 
   void TestBinDebugInfo() {
-    GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
+    CudaGpuId cuda_gpu_id(0);
+    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+        false /*use_unified_memory*/, {}, {});
+    GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
 
     std::vector<void*> initial_ptrs;
     std::vector<size_t> initial_ptrs_allocated_sizes;
@@ -441,7 +497,11 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   }
 
   void TestLog2FloorNonZeroSlow() {
-    GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc");
+    CudaGpuId cuda_gpu_id(0);
+    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
+        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id,
+        false /*use_unified_memory*/, {}, {});
+    GPUBFCAllocator a(sub_allocator, 1 /* total_memory */, "GPU_0_bfc");
     EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
     EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
     EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
author	A. Unique TensorFlower <gardener@tensorflow.org>	2018-09-17 17:49:36 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-09-17 17:53:46 -0700
commit	185aa89912376d4088c22615908696cd30f9951b (patch)
tree	ba465945c19904129b830cc2465a3ee7b58b9247 /tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
parent	4338803b98cd825b0b1d810bcc51c9a79734feb6 (diff)