diff options
author | 2018-09-17 17:49:36 -0700 | |
---|---|---|
committer | 2018-09-17 17:53:46 -0700 | |
commit | 185aa89912376d4088c22615908696cd30f9951b (patch) | |
tree | ba465945c19904129b830cc2465a3ee7b58b9247 /tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc | |
parent | 4338803b98cd825b0b1d810bcc51c9a79734feb6 (diff) |
Eliminate VisitableAllocator.
The visitor pattern is used to allow pre-registration of memory for
DMA access, e.g. for fast GPU/CPU i/o and for RDMA networking. The
VisitableAllocator interface was introduced to support this use some
time ago, prior to SubAllocators. Memory registration works best if
it's done infrequently, on large pieces of memory, rather than on
every piece that's dynamically allocated/freed. This usage pattern
fits the SubAllocator better than a general Allocator. This change
moves memory allocation visitor access to SubAllocator and eliminates
the VisitableAllocator subclass of Allocator.
This change also more rigorously enforces the requirement that all
Visitors be declared prior to memory allocation begining. This is
accomplished by requiring that Visitors be provided to the SubAllocator
constructor.
This refactoring will ease an upcoming CL introducing
NUMA specific CPU devices. It also should fix some performance
pitfalls (e.g. accidental use of PoolAllocator) introduced by an
earlier refactoring of ProcessState that was also in preparation for
NUMA. It restores the default use of the cpu_allocator() value (i.e.
no SubAllocator) by model executions that don't use allocation
visitors (since visitor registration must precede the first allocation,
hence can be detected at that time).
PiperOrigin-RevId: 213371553
Diffstat (limited to 'tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc')
-rw-r--r-- | tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc | 90 |
1 files changed, 75 insertions, 15 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc index 67caeb3495..7112c3afd4 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include <vector> #include "tensorflow/core/common_runtime/gpu/gpu_id.h" +#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h" #include "tensorflow/core/common_runtime/gpu/gpu_init.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" @@ -46,7 +47,11 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use, } TEST(GPUBFCAllocatorTest, NoDups) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); CheckStats(&a, 0, 0, 0, 0); // Allocate a lot of raw pointers @@ -75,7 +80,11 @@ TEST(GPUBFCAllocatorTest, NoDups) { } TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); // Allocate 256 raw pointers of sizes between 100 bytes and about // a meg random::PhiloxRandom philox(123, 17); @@ -133,7 +142,11 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) { } TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); CheckStats(&a, 0, 0, 0, 0); float* first_ptr = a.Allocate<float>(1024); @@ -168,18 +181,30 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { } TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); float* ptr = a.Allocate<float>(0); EXPECT_EQ(nullptr, ptr); } TEST(GPUBFCAllocatorTest, TracksSizes) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); EXPECT_EQ(true, a.TracksAllocationSizes()); } TEST(GPUBFCAllocatorTest, AllocatedVsRequested) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); float* t1 = a.Allocate<float>(1); EXPECT_EQ(4, a.RequestedSize(t1)); EXPECT_EQ(256, a.AllocatedSize(t1)); @@ -187,8 +212,12 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) { } TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) { + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); // Configure a 1MiB byte limit - GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc"); + GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc"); float* first_ptr = a.Allocate<float>(1 << 6); float* second_ptr = a.Allocate<float>(1 << 20); @@ -203,7 +232,11 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) { options.set_allow_growth(true); // Max of 2GiB, but starts out small. - GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1LL << 31, "GPU_0_bfc"); // Allocate 10 raw pointers of sizes between 100 bytes and about // 64 megs. @@ -264,8 +297,15 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) { } TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) { - GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc"); - GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1UL << 60, "GPU_0_bfc"); + sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator b(sub_allocator, 1UL << 60, "GPU_0_bfc"); void* amem = a.AllocateRaw(1, 1); void* bmem = b.AllocateRaw(1, 1 << 30); a.DeallocateRaw(amem); @@ -273,7 +313,11 @@ TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) { } static void BM_Allocation(int iters) { - GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc"); // Exercise a few different allocation sizes std::vector<size_t> sizes = {256, 4096, 16384, 524288, 512, 1048576, 10485760, 104857600, @@ -289,7 +333,11 @@ static void BM_Allocation(int iters) { BENCHMARK(BM_Allocation); static void BM_AllocationThreaded(int iters, int num_threads) { - GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc"); thread::ThreadPool pool(Env::Default(), "test", num_threads); std::atomic_int_fast32_t count(iters); mutex done_lock; @@ -325,7 +373,11 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16); // A more complex benchmark that defers deallocation of an object for // "delay" allocations. static void BM_AllocationDelayed(int iters, int delay) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); // Exercise a few different allocation sizes std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024}; int size_index = 0; @@ -363,7 +415,11 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test { // only methods inside this class can access private members of BFCAllocator. void TestBinDebugInfo() { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); std::vector<void*> initial_ptrs; std::vector<size_t> initial_ptrs_allocated_sizes; @@ -441,7 +497,11 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test { } void TestLog2FloorNonZeroSlow() { - GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc"); + CudaGpuId cuda_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + false /*use_unified_memory*/, {}, {}); + GPUBFCAllocator a(sub_allocator, 1 /* total_memory */, "GPU_0_bfc"); EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0)); EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1)); EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2)); |