diff options
author | TensorFlower Gardener <gardener@tensorflow.org> | 2018-09-19 10:51:51 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-09-19 10:52:22 -0700 |
commit | 96e7185cdb399345fb6e4c656d1b3088f848cf5a (patch) | |
tree | 1f71681791c0cc3077b3cae9703fa8ca26037d37 /tensorflow/core/common_runtime | |
parent | 428f7037bef6dbfdd01a4283a6c76221d381ef7e (diff) | |
parent | 1b166c7e6f30bf7179f31764b3615e63025a7472 (diff) |
Merge pull request #21000 from ROCmSoftwarePlatform:upstream-staging-gpu-common-runtime-1
PiperOrigin-RevId: 213653830
Diffstat (limited to 'tensorflow/core/common_runtime')
16 files changed, 330 insertions, 295 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h index 6b6de80734..3470f7a9f7 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h @@ -34,11 +34,11 @@ namespace tensorflow { // Suballocator for GPU memory. class GPUMemAllocator : public SubAllocator { public: - // 'cuda_gpu_id' refers to the ID of the GPU device within + // 'platform_gpu_id' refers to the ID of the GPU device within // the process and must reference a valid ID in the process. // Note: stream_exec cannot be null. - explicit GPUMemAllocator(se::StreamExecutor* stream_exec, CudaGpuId gpu_id, - bool use_unified_memory, + explicit GPUMemAllocator(se::StreamExecutor* stream_exec, + PlatformGpuId gpu_id, bool use_unified_memory, const std::vector<Visitor>& alloc_visitors, const std::vector<Visitor>& free_visitors) : SubAllocator(alloc_visitors, free_visitors), @@ -76,7 +76,7 @@ class GPUMemAllocator : public SubAllocator { private: se::StreamExecutor* stream_exec_; // not owned, non-null - const CudaGpuId gpu_id_; + const PlatformGpuId gpu_id_; const bool use_unified_memory_ = false; TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator); diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc index 7112c3afd4..e313135d8d 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -47,10 +47,10 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use, } TEST(GPUBFCAllocatorTest, NoDups) { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); CheckStats(&a, 0, 0, 0, 0); @@ -80,10 +80,10 @@ TEST(GPUBFCAllocatorTest, NoDups) { } TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); // Allocate 256 raw pointers of sizes between 100 bytes and about // a meg @@ -142,10 +142,10 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) { } TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); CheckStats(&a, 0, 0, 0, 0); @@ -181,29 +181,29 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { } TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); float* ptr = a.Allocate<float>(0); EXPECT_EQ(nullptr, ptr); } TEST(GPUBFCAllocatorTest, TracksSizes) { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); EXPECT_EQ(true, a.TracksAllocationSizes()); } TEST(GPUBFCAllocatorTest, AllocatedVsRequested) { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); float* t1 = a.Allocate<float>(1); EXPECT_EQ(4, a.RequestedSize(t1)); @@ -212,10 +212,10 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) { } TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); // Configure a 1MiB byte limit GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc"); @@ -232,10 +232,10 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) { options.set_allow_growth(true); // Max of 2GiB, but starts out small. - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1LL << 31, "GPU_0_bfc"); // Allocate 10 raw pointers of sizes between 100 bytes and about @@ -297,14 +297,14 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) { } TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1UL << 60, "GPU_0_bfc"); sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator b(sub_allocator, 1UL << 60, "GPU_0_bfc"); void* amem = a.AllocateRaw(1, 1); void* bmem = b.AllocateRaw(1, 1 << 30); @@ -313,10 +313,10 @@ TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) { } static void BM_Allocation(int iters) { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc"); // Exercise a few different allocation sizes std::vector<size_t> sizes = {256, 4096, 16384, 524288, @@ -333,10 +333,10 @@ static void BM_Allocation(int iters) { BENCHMARK(BM_Allocation); static void BM_AllocationThreaded(int iters, int num_threads) { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc"); thread::ThreadPool pool(Env::Default(), "test", num_threads); std::atomic_int_fast32_t count(iters); @@ -373,10 +373,10 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16); // A more complex benchmark that defers deallocation of an object for // "delay" allocations. static void BM_AllocationDelayed(int iters, int delay) { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); // Exercise a few different allocation sizes std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024}; @@ -415,10 +415,10 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test { // only methods inside this class can access private members of BFCAllocator. void TestBinDebugInfo() { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc"); std::vector<void*> initial_ptrs; @@ -497,10 +497,10 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test { } void TestLog2FloorNonZeroSlow() { - CudaGpuId cuda_gpu_id(0); + PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUBFCAllocator a(sub_allocator, 1 /* total_memory */, "GPU_0_bfc"); EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0)); EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1)); diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc index 8e14f1ea75..d85ca8892f 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc @@ -28,9 +28,10 @@ limitations under the License. namespace tensorflow { GPUcudaMallocAllocator::GPUcudaMallocAllocator(Allocator* allocator, - CudaGpuId cuda_gpu_id) + PlatformGpuId platform_gpu_id) : base_allocator_(allocator) { - stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + stream_exec_ = + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); } GPUcudaMallocAllocator::~GPUcudaMallocAllocator() { delete base_allocator_; } diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h index 3d1d0ef481..8df3724bc4 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h @@ -31,7 +31,8 @@ namespace tensorflow { // allocated memory. class GPUcudaMallocAllocator : public Allocator { public: - explicit GPUcudaMallocAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id); + explicit GPUcudaMallocAllocator(Allocator* allocator, + PlatformGpuId platform_gpu_id); ~GPUcudaMallocAllocator() override; string Name() override { return "gpu_debug"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override; diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc index 6bad66dcec..989ddbe4af 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc @@ -74,9 +74,10 @@ void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) { // GPUDebugAllocator // ----------------------------------------------------------------------------- GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator, - CudaGpuId cuda_gpu_id) + PlatformGpuId platform_gpu_id) : base_allocator_(allocator) { - stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + stream_exec_ = + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); } GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; } @@ -151,9 +152,10 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) { // GPUNanResetAllocator // ----------------------------------------------------------------------------- GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator, - CudaGpuId cuda_gpu_id) + PlatformGpuId platform_gpu_id) : base_allocator_(allocator) { - stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + stream_exec_ = + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); } GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; } diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h index 0f27ff4384..17757a106c 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h @@ -33,7 +33,8 @@ namespace tensorflow { // allocated memory. class GPUDebugAllocator : public Allocator { public: - explicit GPUDebugAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id); + explicit GPUDebugAllocator(Allocator* allocator, + PlatformGpuId platform_gpu_id); ~GPUDebugAllocator() override; string Name() override { return "gpu_debug"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override; @@ -62,7 +63,8 @@ class GPUDebugAllocator : public Allocator { // user forgets to initialize the memory. class GPUNanResetAllocator : public Allocator { public: - explicit GPUNanResetAllocator(Allocator* allocator, CudaGpuId cuda_gpu_id); + explicit GPUNanResetAllocator(Allocator* allocator, + PlatformGpuId platform_gpu_id); ~GPUNanResetAllocator() override; string Name() override { return "gpu_nan_reset"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override; diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc index 98283cd846..aca08a7e33 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc @@ -34,13 +34,14 @@ namespace tensorflow { namespace { TEST(GPUDebugAllocatorTest, OverwriteDetection_None) { - const CudaGpuId cuda_gpu_id(0); + const PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), - cuda_gpu_id); - auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + platform_gpu_id); + auto stream_exec = + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); for (int s : {8}) { std::vector<int64> cpu_array(s); @@ -61,14 +62,14 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) { for (int s : {8, 211}) { EXPECT_DEATH( { - const CudaGpuId cuda_gpu_id(0); + const PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), - cuda_gpu_id, false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), - cuda_gpu_id); + platform_gpu_id); auto stream_exec = - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); std::vector<int64> cpu_array(s); memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64)); @@ -97,14 +98,14 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) { for (int s : {8, 22}) { EXPECT_DEATH( { - const CudaGpuId cuda_gpu_id(0); + const PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), - cuda_gpu_id, false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), - cuda_gpu_id); + platform_gpu_id); auto stream_exec = - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); std::vector<int64> cpu_array(s); memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64)); @@ -130,13 +131,14 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) { } TEST(GPUDebugAllocatorTest, ResetToNan) { - const CudaGpuId cuda_gpu_id(0); + const PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUNanResetAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), - cuda_gpu_id); - auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + platform_gpu_id); + auto stream_exec = + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); std::vector<float> cpu_array(1024); std::vector<float> cpu_array_result(1024); @@ -173,16 +175,17 @@ TEST(GPUDebugAllocatorTest, ResetToNan) { } TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) { - const CudaGpuId cuda_gpu_id(0); + const PlatformGpuId platform_gpu_id(0); // NaN reset must be the outer-most allocator. GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUNanResetAllocator a( new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), - cuda_gpu_id), - cuda_gpu_id); - auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + platform_gpu_id), + platform_gpu_id); + auto stream_exec = + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); std::vector<float> cpu_array(1024); std::vector<float> cpu_array_result(1024); @@ -219,24 +222,24 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) { } TEST(GPUDebugAllocatorTest, TracksSizes) { - const CudaGpuId cuda_gpu_id(0); + const PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUDebugAllocator a(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), - cuda_gpu_id); + platform_gpu_id); EXPECT_EQ(true, a.TracksAllocationSizes()); } TEST(GPUDebugAllocatorTest, AllocatedVsRequested) { - const CudaGpuId cuda_gpu_id(0); + const PlatformGpuId platform_gpu_id(0); GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, - false /*use_unified_memory*/, {}, {}); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, false /*use_unified_memory*/, {}, {}); GPUNanResetAllocator a( new GPUDebugAllocator(new GPUBFCAllocator(sub_allocator, 1 << 30, ""), - cuda_gpu_id), - cuda_gpu_id); + platform_gpu_id), + platform_gpu_id); float* t1 = a.Allocate<float>(1); EXPECT_EQ(4, a.RequestedSize(t1)); EXPECT_EQ(256, a.AllocatedSize(t1)); diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 50e61b7e00..cf3faf68ff 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -104,9 +104,9 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface { reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize); stream_ = cuda_stream; allocator_ = alloc; - CudaGpuId cuda_gpu_id; - TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id)); - device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id.value()]; + PlatformGpuId platform_gpu_id; + TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id)); + device_prop_ = &Eigen::m_deviceProperties[platform_gpu_id.value()]; } const cudaStream_t& stream() const override { return *stream_; } @@ -342,9 +342,10 @@ Status BaseGPUDevice::Init(const SessionOptions& options) { gpu_device_info_->stream = streams_[0]->compute; gpu_device_info_->default_context = device_contexts_[0]; gpu_device_info_->event_mgr = em_.get(); - CudaGpuId cuda_gpu_id; - TF_RETURN_IF_ERROR(GpuIdManager::TfToCudaGpuId(tf_gpu_id_, &cuda_gpu_id)); - gpu_device_info_->gpu_id = cuda_gpu_id.value(); + PlatformGpuId platform_gpu_id; + TF_RETURN_IF_ERROR( + GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id)); + gpu_device_info_->gpu_id = platform_gpu_id.value(); set_tensorflow_gpu_device_info(gpu_device_info_); // Whether and how the GPU device uses its own threadpool. @@ -700,9 +701,9 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice { Eigen::GpuDevice device_; }; -// Parse 'visible_device_list' into a list of CUDA GPU ids. +// Parse 'visible_device_list' into a list of platform GPU ids. Status ParseVisibleDeviceList(const string& visible_device_list, - std::vector<CudaGpuId>* visible_gpu_order) { + std::vector<PlatformGpuId>* visible_gpu_order) { visible_gpu_order->clear(); se::Platform* gpu_manager = GPUMachineManager(); @@ -717,26 +718,28 @@ Status ParseVisibleDeviceList(const string& visible_device_list, } else { const std::vector<string> order_str = str_util::Split(visible_device_list, ','); - for (const string& cuda_gpu_id_str : order_str) { - int32 cuda_gpu_id; - if (!strings::safe_strto32(cuda_gpu_id_str, &cuda_gpu_id)) { + for (const string& platform_gpu_id_str : order_str) { + int32 platform_gpu_id; + if (!strings::safe_strto32(platform_gpu_id_str, &platform_gpu_id)) { return errors::InvalidArgument( "Could not parse entry in 'visible_device_list': '", - cuda_gpu_id_str, "'. visible_device_list = ", visible_device_list); + platform_gpu_id_str, "'. visible_device_list = ", + visible_device_list); } - if (cuda_gpu_id < 0 || cuda_gpu_id >= gpu_manager->VisibleDeviceCount()) { + if (platform_gpu_id < 0 || + platform_gpu_id >= gpu_manager->VisibleDeviceCount()) { return errors::InvalidArgument( - "'visible_device_list' listed an invalid GPU id '", cuda_gpu_id, + "'visible_device_list' listed an invalid GPU id '", platform_gpu_id, "' but visible device count is ", gpu_manager->VisibleDeviceCount()); } - visible_gpu_order->push_back(CudaGpuId(cuda_gpu_id)); + visible_gpu_order->push_back(PlatformGpuId(platform_gpu_id)); } } // Validate no repeats. - std::set<CudaGpuId> visible_device_set(visible_gpu_order->begin(), - visible_gpu_order->end()); + std::set<PlatformGpuId> visible_device_set(visible_gpu_order->begin(), + visible_gpu_order->end()); if (visible_device_set.size() != visible_gpu_order->size()) { return errors::InvalidArgument( "visible_device_list contained a duplicate entry: ", @@ -747,8 +750,8 @@ Status ParseVisibleDeviceList(const string& visible_device_list, Status VerifyVirtualDeviceSettings( const size_t num_gpus_to_use, const GPUOptions& gpu_options, - const std::vector<CudaGpuId>& visible_gpu_order, - const std::vector<CudaGpuId>& valid_cuda_gpu_ids) { + const std::vector<PlatformGpuId>& visible_gpu_order, + const std::vector<PlatformGpuId>& valid_platform_gpu_ids) { const auto& virtual_devices = gpu_options.experimental().virtual_devices(); CHECK(!virtual_devices.empty()); if (gpu_options.per_process_gpu_memory_fraction() > 0) { @@ -770,11 +773,11 @@ Status VerifyVirtualDeviceSettings( " #GPUs in visible_device_list: ", visible_gpu_order.size(), " virtual_devices.size(): ", virtual_devices.size()); } - if (valid_cuda_gpu_ids.size() != virtual_devices.size()) { + if (valid_platform_gpu_ids.size() != virtual_devices.size()) { return errors::Unknown( "The number of valid GPUs doesn't match the number of elements in " "the virtual_devices list.", - " #valid GPUs: ", valid_cuda_gpu_ids.size(), + " #valid GPUs: ", valid_platform_gpu_ids.size(), " virtual_devices.size(): ", virtual_devices.size()); } return Status::OK(); @@ -816,18 +819,18 @@ int64 MinSystemMemory(int64 available_memory) { } // Get the memory limit for the virtual device being created on GPU with -// 'cuda_gpu_id', when that virtual device is the only virtual device being +// 'platform_gpu_id', when that virtual device is the only virtual device being // created on that GPU. Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options, - CudaGpuId cuda_gpu_id, + PlatformGpuId platform_gpu_id, int64* memory_limit) { int64 total_memory = 0; int64 available_memory = 0; se::StreamExecutor* se = - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) { return errors::Unknown("Failed to query available memory for GPU ", - cuda_gpu_id.value()); + platform_gpu_id.value()); } int64 allocated_memory = 0; @@ -928,8 +931,8 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, num_gpus_to_use = iter->second; } const auto& gpu_options = options.config.gpu_options(); - std::vector<CudaGpuId> visible_gpu_order; - std::vector<CudaGpuId> valid_cuda_gpu_ids; + std::vector<PlatformGpuId> visible_gpu_order; + std::vector<PlatformGpuId> valid_platform_gpu_ids; // If we aren't going to use any GPUs, don't initialize them. // We don't want to call ParseVisibleDeviceList if num_gpus_to_use is 0, // because it treats an empty gpu_options.visible_device_list as 'all GPUs are @@ -938,12 +941,12 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(), &visible_gpu_order)); TF_RETURN_IF_ERROR( - GetValidDeviceIds(visible_gpu_order, &valid_cuda_gpu_ids)); + GetValidDeviceIds(visible_gpu_order, &valid_platform_gpu_ids)); } - if (num_gpus_to_use > valid_cuda_gpu_ids.size()) { - num_gpus_to_use = valid_cuda_gpu_ids.size(); + if (num_gpus_to_use > valid_platform_gpu_ids.size()) { + num_gpus_to_use = valid_platform_gpu_ids.size(); } - if (!valid_cuda_gpu_ids.empty()) { + if (!valid_platform_gpu_ids.empty()) { // Save the original device. int original_device = 0; cudaError_t err = cudaGetDevice(&original_device); @@ -953,17 +956,18 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, } // Force to implicitly initialize CUDA runtime on each valid GPU before // CreateGPUDevice(). - for (CudaGpuId cuda_gpu_id : valid_cuda_gpu_ids) { - err = cudaSetDevice(cuda_gpu_id.value()); + for (PlatformGpuId platform_gpu_id : valid_platform_gpu_ids) { + err = cudaSetDevice(platform_gpu_id.value()); if (err != cudaSuccess) { - return errors::Internal("cudaSetDevice() on GPU:", cuda_gpu_id.value(), - " failed. Status: ", cudaGetErrorString(err)); + return errors::Internal("cudaSetDevice() on GPU:", + platform_gpu_id.value(), " failed. Status: ", + cudaGetErrorString(err)); } err = cudaFree(nullptr); if (err != cudaSuccess) { - return errors::Internal( - "CUDA runtime implicit initialization on GPU:", cuda_gpu_id.value(), - " failed. Status: ", cudaGetErrorString(err)); + return errors::Internal("CUDA runtime implicit initialization on GPU:", + platform_gpu_id.value(), " failed. Status: ", + cudaGetErrorString(err)); } } // Reset to the original device. @@ -989,10 +993,10 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, LOG(INFO) << line_buf; for (int i = 0; i < visible_gpu_order.size(); ++i) { line_buf = strings::StrCat(visible_gpu_order[i].value(), ": "); - CudaGpuId cuda_id_i = visible_gpu_order[i]; + PlatformGpuId gpu_id_i = visible_gpu_order[i]; for (int j = 0; j < visible_gpu_order.size(); ++j) { - CudaGpuId cuda_id_j = visible_gpu_order[j]; - if (im.directed_links.find({cuda_id_i, cuda_id_j}) != + PlatformGpuId gpu_id_j = visible_gpu_order[j]; + if (im.directed_links.find({gpu_id_i, gpu_id_j}) != im.directed_links.end()) { line_buf.append("Y "); } else { @@ -1005,22 +1009,23 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, const auto& virtual_devices = gpu_options.experimental().virtual_devices(); if (!virtual_devices.empty()) { - TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings( - num_gpus_to_use, gpu_options, visible_gpu_order, valid_cuda_gpu_ids)); + TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(num_gpus_to_use, gpu_options, + visible_gpu_order, + valid_platform_gpu_ids)); // We've verified that num_gpus_to_use >= virtual_devices.size(). num_gpus_to_use = virtual_devices.size(); CHECK(gpu_options.visible_device_list().empty() || - valid_cuda_gpu_ids == visible_gpu_order); + valid_platform_gpu_ids == visible_gpu_order); } int next_tf_gpu_id = 0; std::vector<int64> memory_limit_bytes; for (int i = 0; i < num_gpus_to_use; ++i) { - const CudaGpuId cuda_gpu_id = valid_cuda_gpu_ids[i]; + const PlatformGpuId platform_gpu_id = valid_platform_gpu_ids[i]; if (virtual_devices.empty() || virtual_devices.Get(i).memory_limit_mb_size() == 0) { int64 single_virtual_device_memory_limit = 0; TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit( - gpu_options, cuda_gpu_id, &single_virtual_device_memory_limit)); + gpu_options, platform_gpu_id, &single_virtual_device_memory_limit)); memory_limit_bytes.push_back(single_virtual_device_memory_limit); } else { const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb(); @@ -1033,7 +1038,7 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, TfGpuId tf_gpu_id(next_tf_gpu_id); ++next_tf_gpu_id; TF_RETURN_IF_ERROR( - GpuIdManager::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id)); + GpuIdManager::InsertTfPlatformGpuIdPair(tf_gpu_id, platform_gpu_id)); } } const int num_tf_gpus = next_tf_gpu_id; @@ -1058,7 +1063,7 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, return Status::OK(); } -static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id, +static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id, const se::DeviceDescription& desc) { int cc_major; int cc_minor; @@ -1067,9 +1072,8 @@ static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id, cc_minor = 0; } // LINT.IfChange - return strings::StrCat("device: ", cuda_gpu_id.value(), - ", name: ", desc.name(), - ", pci bus id: ", desc.pci_bus_id(), + return strings::StrCat("device: ", platform_gpu_id.value(), ", name: ", + desc.name(), ", pci bus id: ", desc.pci_bus_id(), ", compute capability: ", cc_major, ".", cc_minor); // LINT.ThenChange(//tensorflow/python/platform/test.py) } @@ -1084,12 +1088,13 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options, const string device_name = strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value()); GpuIdUtil::CheckValidTfGpuId(tf_gpu_id); - CudaGpuId cuda_gpu_id; - TF_RETURN_IF_ERROR(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id)); + PlatformGpuId platform_gpu_id; + TF_RETURN_IF_ERROR( + GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id)); int numa_node = dev_locality.numa_node(); se::StreamExecutor* se = - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); const se::DeviceDescription& desc = se->GetDeviceDescription(); GPUProcessState* process_state = GPUProcessState::singleton(); Allocator* gpu_allocator = process_state->GetGPUAllocator( @@ -1110,11 +1115,11 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options, // TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit. BaseGPUDevice* gpu_device = CreateGPUDevice( options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality, - tf_gpu_id, GetShortDeviceDescription(cuda_gpu_id, desc), gpu_allocator, - ProcessState::singleton()->GetCPUAllocator(numa_node)); + tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, desc), + gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node)); LOG(INFO) << "Created TensorFlow device (" << device_name << " with " << (stats.bytes_limit >> 20) << " MB memory) -> physical GPU (" - << GetShortDeviceDescription(cuda_gpu_id, desc) << ")"; + << GetShortDeviceDescription(platform_gpu_id, desc) << ")"; TF_RETURN_IF_ERROR(gpu_device->Init(options)); devices->push_back(gpu_device); @@ -1122,18 +1127,21 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options, } namespace { -std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> +std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>> GetPeerAccessMap(se::Platform* platform, - const std::vector<CudaGpuId>& visible_gpu_order) { - std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> map( - new std::map<std::pair<CudaGpuId, CudaGpuId>, bool>); - for (CudaGpuId cuda_gpu_i : visible_gpu_order) { - for (CudaGpuId cuda_gpu_j : visible_gpu_order) { + const std::vector<PlatformGpuId>& visible_gpu_order) { + std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>> map( + new std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>); + for (PlatformGpuId platform_gpu_i : visible_gpu_order) { + for (PlatformGpuId platform_gpu_j : visible_gpu_order) { se::StreamExecutor* from = - GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_i) + .ValueOrDie(); se::StreamExecutor* to = - GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie(); - (*map)[{cuda_gpu_i, cuda_gpu_j}] = from->CanEnablePeerAccessTo(to); + GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_j) + .ValueOrDie(); + (*map)[{platform_gpu_i, platform_gpu_j}] = + from->CanEnablePeerAccessTo(to); } } @@ -1143,19 +1151,19 @@ GetPeerAccessMap(se::Platform* platform, } // namespace Status BaseGPUDeviceFactory::GetInterconnectMaps( - const std::vector<CudaGpuId>& visible_gpu_order, se::Platform* gpu_manager, - std::vector<InterconnectMap>* maps) { + const std::vector<PlatformGpuId>& visible_gpu_order, + se::Platform* gpu_manager, std::vector<InterconnectMap>* maps) { // The default interconnect map is obtained from the StreamExecutor. auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order); maps->resize(1); InterconnectMap& imap = maps->at(0); imap.name = "StreamExecutor"; imap.strength = InterconnectMap::kStreamExecutorStrength; - for (CudaGpuId cuda_id_i : visible_gpu_order) { - for (CudaGpuId cuda_id_j : visible_gpu_order) { - if (cuda_id_i == cuda_id_j) continue; - if ((*access_map)[{cuda_id_i, cuda_id_j}]) { - imap.directed_links.insert({cuda_id_i, cuda_id_j}); + for (PlatformGpuId gpu_id_i : visible_gpu_order) { + for (PlatformGpuId gpu_id_j : visible_gpu_order) { + if (gpu_id_i == gpu_id_j) continue; + if ((*access_map)[{gpu_id_i, gpu_id_j}]) { + imap.directed_links.insert({gpu_id_i, gpu_id_j}); } } } @@ -1170,13 +1178,14 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities( all_tf_gpu_ids.push_back(TfGpuId(i)); } for (TfGpuId tf_gpu_id : all_tf_gpu_ids) { - CudaGpuId cuda_gpu_id; - TF_RETURN_IF_ERROR(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id)); + PlatformGpuId platform_gpu_id; + TF_RETURN_IF_ERROR( + GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id)); // Get GPU bus_id from its reported NUMA affinity. Because GPUs are // virtualized in some environments, we can't just use the GPU id. // NUMA locales are indexed from 0, buses are indexed from 1. se::StreamExecutor* se = - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); const se::DeviceDescription& desc = se->GetDeviceDescription(); int numa_node = desc.numa_node(); if (numa_node < 0) { @@ -1186,7 +1195,8 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities( // may run into trouble later with data transfer operations. The // trouble may manifest as slower than expected performance, or // outright failures. - LOG(INFO) << "Could not identify NUMA node of CUDA gpu id " << cuda_gpu_id + LOG(INFO) << "Could not identify NUMA node of platform GPU id " + << platform_gpu_id << ", defaulting to 0. Your kernel may not have been built " << "with NUMA support."; numa_node = 0; @@ -1199,10 +1209,10 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities( LocalLinks* links = dev_locality.mutable_links(); for (const InterconnectMap& imap : interconnects) { for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) { - CudaGpuId cuda_gpu_dst; + PlatformGpuId platform_gpu_dst; TF_RETURN_IF_ERROR( - GpuIdManager::TfToCudaGpuId(tf_gpu_dst, &cuda_gpu_dst)); - if (imap.directed_links.find({cuda_gpu_id, cuda_gpu_dst}) != + GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst)); + if (imap.directed_links.find({platform_gpu_id, platform_gpu_dst}) != imap.directed_links.end()) { InterconnectLink* ilink = links->add_link(); ilink->set_device_id(tf_gpu_dst.value()); @@ -1216,10 +1226,10 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities( // add high strength links to the others. for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) { if (tf_gpu_id == tf_gpu_dst) continue; - CudaGpuId cuda_gpu_dst; + PlatformGpuId platform_gpu_dst; TF_RETURN_IF_ERROR( - GpuIdManager::TfToCudaGpuId(tf_gpu_dst, &cuda_gpu_dst)); - if (cuda_gpu_id == cuda_gpu_dst) { + GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst)); + if (platform_gpu_id == platform_gpu_dst) { InterconnectLink* ilink = links->add_link(); ilink->set_device_id(tf_gpu_dst.value()); ilink->set_type("SAME_DEVICE"); @@ -1228,9 +1238,9 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities( } (*localities)[tf_gpu_id] = dev_locality; - VLOG(1) << "GPUDevice CudaGpuId " << cuda_gpu_id << " TfGpuId " << tf_gpu_id - << " on bus " << dev_locality.bus_id() << " numa: " << numa_node - << " pci: " << desc.pci_bus_id() + VLOG(1) << "GPUDevice PlatformGpuId " << platform_gpu_id << " TfGpuId " + << tf_gpu_id << " on bus " << dev_locality.bus_id() + << " numa: " << numa_node << " pci: " << desc.pci_bus_id() << " DeviceLocality: " << dev_locality.DebugString(); } return Status::OK(); @@ -1238,14 +1248,14 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities( static int GetDefaultMinGPUMultiprocessorCount( se::Platform* gpu_manager, - const std::vector<CudaGpuId>& visible_gpu_order) { + const std::vector<PlatformGpuId>& visible_gpu_order) { static const int kDefaultMinGPUMultiprocessorCount = 8; // Find the highest multi-processor count across all visible GPUs. int max_count = -1; for (int i = 0; i < visible_gpu_order.size(); ++i) { auto exec_status = - GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_order[i]); + GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_order[i]); if (!exec_status.ok()) { continue; } @@ -1264,7 +1274,7 @@ static int GetDefaultMinGPUMultiprocessorCount( static int GetMinGPUMultiprocessorCount( se::Platform* gpu_manager, - const std::vector<CudaGpuId>& visible_gpu_order) { + const std::vector<PlatformGpuId>& visible_gpu_order) { const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT"); if (tf_min_gpu_core_count == nullptr || @@ -1342,18 +1352,20 @@ std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() { } Status EnablePeerAccess(se::Platform* platform, - const std::vector<CudaGpuId>& visible_gpu_order) { + const std::vector<PlatformGpuId>& visible_gpu_order) { int possible_peer_count = 0; int enabled_peer_count = 0; for (int i = 0; i < visible_gpu_order.size(); ++i) { - const CudaGpuId cuda_gpu_i = visible_gpu_order[i]; + const PlatformGpuId platform_gpu_i = visible_gpu_order[i]; for (int j = 0; j < visible_gpu_order.size(); ++j) { - const CudaGpuId cuda_gpu_j = visible_gpu_order[j]; + const PlatformGpuId platform_gpu_j = visible_gpu_order[j]; // We have already validated that ExecutorForDevice() calls return OK. se::StreamExecutor* from = - GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_i) + .ValueOrDie(); se::StreamExecutor* to = - GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_j) + .ValueOrDie(); if (from->CanEnablePeerAccessTo(to)) { ++possible_peer_count; @@ -1361,7 +1373,8 @@ Status EnablePeerAccess(se::Platform* platform, if (!status.ok()) { LOG(WARNING) << "Unable to enable peer access between device ordinals " - << cuda_gpu_i << " and " << cuda_gpu_j << ", status: " << status; + << platform_gpu_i << " and " << platform_gpu_j + << ", status: " << status; } else { ++enabled_peer_count; } @@ -1384,22 +1397,23 @@ Status EnablePeerAccess(se::Platform* platform, } // namespace Status BaseGPUDeviceFactory::GetValidDeviceIds( - const std::vector<CudaGpuId>& visible_gpu_order, - std::vector<CudaGpuId>* ids) { + const std::vector<PlatformGpuId>& visible_gpu_order, + std::vector<PlatformGpuId>* ids) { se::Platform* gpu_manager = GPUMachineManager(); bool new_gpu_found = false; for (int i = 0; i < visible_gpu_order.size(); ++i) { - const CudaGpuId cuda_gpu_id = visible_gpu_order[i]; + const PlatformGpuId visible_gpu_id = visible_gpu_order[i]; - // Only perform this once per visible cuda gpu id. - if (visible_gpu_initialized_[cuda_gpu_id.value()]) { + // Only perform this once per visible platform gpu id. + if (visible_gpu_initialized_[visible_gpu_id.value()]) { continue; } - visible_gpu_initialized_[cuda_gpu_id.value()] = true; + visible_gpu_initialized_[visible_gpu_id.value()] = true; new_gpu_found = true; - auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id); + auto executor = + GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_id); if (!executor.ok()) { return executor.status(); } @@ -1447,9 +1461,9 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( // Filter out devices that don't have the right capability or power. for (int i = 0; i < visible_gpu_order.size(); ++i) { - const CudaGpuId visible_gpu_id = visible_gpu_order[i]; + const PlatformGpuId visible_gpu_id = visible_gpu_order[i]; auto exec_status = - GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_id); + GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_id); if (!exec_status.ok()) { LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id << " whose executor is in invalid state: " @@ -1498,7 +1512,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( if (!ids->empty()) { std::vector<int> raw_ids(ids->size()); std::transform(ids->begin(), ids->end(), raw_ids.begin(), - [](CudaGpuId id) -> int { return id.value(); }); + [](PlatformGpuId id) -> int { return id.value(); }); LOG(INFO) << "Adding visible gpu devices: " << str_util::Join(raw_ids, ", "); } diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index b3eea55758..b25fe8645f 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -90,12 +90,12 @@ class BaseGPUDevice : public LocalDevice { DeviceContext* dc, Allocator* allocator) override; - // Returns the CUDA GPU id of this device within the native driver system; + // Returns the platform GPU id of this device within the native driver system; // e.g., for CUDA this is the ordinal of the GPU within the system. int gpu_id() const { - CudaGpuId cuda_gpu_id; - TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id_, &cuda_gpu_id)); - return cuda_gpu_id.value(); + PlatformGpuId platform_gpu_id; + TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id)); + return platform_gpu_id.value(); } // The executor that provides control for the device; e.g., for CUDA this @@ -173,14 +173,14 @@ class BaseGPUDeviceFactory : public DeviceFactory { int32 strength; static const int kSameDeviceStrength; static const int kStreamExecutorStrength; - std::set<std::pair<CudaGpuId, CudaGpuId>> directed_links; + std::set<std::pair<PlatformGpuId, PlatformGpuId>> directed_links; }; protected: // Populates *maps with interconnect maps for all local direct access // pathways between GPUs. virtual Status GetInterconnectMaps( - const std::vector<CudaGpuId>& visible_gpu_order, + const std::vector<PlatformGpuId>& visible_gpu_order, se::Platform* gpu_manager, std::vector<InterconnectMap>* maps); struct TfGpuIdHash { @@ -212,16 +212,16 @@ class BaseGPUDeviceFactory : public DeviceFactory { Allocator* gpu_allocator, Allocator* cpu_allocator) = 0; - // Returns into 'ids' the list of valid CUDA GPU ids, in the order that + // Returns into 'ids' the list of valid platform GPU ids, in the order that // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc, // based upon 'visible_gpu_order' which was generated by parsing // GPUOptions::visible_device_list which is a comma-separated list of CUDA GPU // ids. - Status GetValidDeviceIds(const std::vector<CudaGpuId>& visible_gpu_order, - std::vector<CudaGpuId>* ids); + Status GetValidDeviceIds(const std::vector<PlatformGpuId>& visible_gpu_order, + std::vector<PlatformGpuId>* ids); - // visible_gpu_initialized_[cuda_gpu_id] is true if visible GPU cuda_gpu_id - // has been initialized by the process. + // visible_gpu_initialized_[platform_gpu_id] is true if visible GPU + // platform_gpu_id has been initialized by the process. std::unordered_map<int, bool> visible_gpu_initialized_; }; diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc index daf59f0560..36294094e9 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc @@ -30,18 +30,21 @@ namespace tensorflow { namespace { const char* kDeviceNamePrefix = "/job:localhost/replica:0/task:0"; -int64 GetTotalGPUMemory(CudaGpuId gpu_id) { +int64 GetTotalGPUMemory(PlatformGpuId gpu_id) { se::StreamExecutor* se = - GpuIdUtil::ExecutorForCudaGpuId(GPUMachineManager(), gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(GPUMachineManager(), gpu_id) + .ValueOrDie(); int64 total_memory, available_memory; CHECK(se->DeviceMemoryUsage(&available_memory, &total_memory)); return total_memory; } -Status GetComputeCapability(CudaGpuId gpu_id, int* cc_major, int* cc_minor) { +Status GetComputeCapability(PlatformGpuId gpu_id, int* cc_major, + int* cc_minor) { se::StreamExecutor* se = - GpuIdUtil::ExecutorForCudaGpuId(GPUMachineManager(), gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(GPUMachineManager(), gpu_id) + .ValueOrDie(); if (!se->GetDeviceDescription().cuda_compute_capability(cc_major, cc_minor)) { *cc_major = 0; *cc_minor = 0; @@ -223,7 +226,7 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevices) { // error. TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) { int cc_major, cc_minor; - TF_ASSERT_OK(GetComputeCapability(CudaGpuId(0), &cc_major, &cc_minor)); + TF_ASSERT_OK(GetComputeCapability(PlatformGpuId(0), &cc_major, &cc_minor)); // Exit early while running on Pascal or later GPUs. if (cc_major >= 6) { return; @@ -244,10 +247,10 @@ TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) { // more memory than what is available on the device. TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) { static constexpr double kGpuMemoryFraction = 1.2; - static constexpr CudaGpuId kCudaGpuId(0); + static constexpr PlatformGpuId kPlatformGpuId(0); int cc_major, cc_minor; - TF_ASSERT_OK(GetComputeCapability(kCudaGpuId, &cc_major, &cc_minor)); + TF_ASSERT_OK(GetComputeCapability(kPlatformGpuId, &cc_major, &cc_minor)); // Exit early if running on pre-Pascal GPUs. if (cc_major < 6) { LOG(INFO) @@ -262,7 +265,7 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) { ASSERT_EQ(1, devices.size()); int64 memory_limit = devices[0]->attributes().memory_limit(); - ASSERT_EQ(memory_limit, static_cast<int64>(GetTotalGPUMemory(kCudaGpuId) * + ASSERT_EQ(memory_limit, static_cast<int64>(GetTotalGPUMemory(kPlatformGpuId) * kGpuMemoryFraction)); AllocatorAttributes allocator_attributes = AllocatorAttributes(); diff --git a/tensorflow/core/common_runtime/gpu/gpu_id.h b/tensorflow/core/common_runtime/gpu/gpu_id.h index 2a6caea296..f0d9022821 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_id.h +++ b/tensorflow/core/common_runtime/gpu/gpu_id.h @@ -25,10 +25,10 @@ namespace tensorflow { // physical machine, it can be filtered by CUDA environment variable // CUDA_VISIBLE_DEVICES. Note that this id is not visible to Tensorflow, but // result after filtering by CUDA_VISIBLE_DEVICES is visible to TF and is -// called CUDA GPU id as below. See +// called platform GPU id as below. See // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars // for more details. -// - CUDA GPU id (also called *visible* GPU id in +// - *platform* GPU id (also called *visible* GPU id in // third_party/tensorflow/core/protobuf/config.proto): this is the id that is // visible to Tensorflow after filtering by CUDA_VISIBLE_DEVICES, and is // generated by the CUDA GPU driver. It starts from 0 and is used for CUDA API @@ -39,14 +39,14 @@ namespace tensorflow { // field of the device name "/device:GPU:<id>", and is also the identifier of // a BaseGPUDevice. Note that the configuration allows us to create multiple // BaseGPUDevice per GPU hardware in order to use multi CUDA streams on the -// hardware, so the mapping between TF GPU id and CUDA GPU id is not a 1:1 +// hardware, so the mapping between TF GPU id and platform GPU id is not a 1:1 // mapping, see the example below. // // For example, assuming that in the machine we have GPU device with index 0, 1, // 2 and 3 (physical GPU id). Setting "CUDA_VISIBLE_DEVICES=1,2,3" will create -// the following mapping between CUDA GPU id and physical GPU id: +// the following mapping between platform GPU id and physical GPU id: // -// CUDA GPU id -> physical GPU id +// platform GPU id -> physical GPU id // 0 -> 1 // 1 -> 2 // 2 -> 3 @@ -56,32 +56,32 @@ namespace tensorflow { // // Assuming we configure the Session to create one BaseGPUDevice per GPU // hardware, then setting GPUOptions::visible_device_list to "2,0" will create -// the following mappting between TF GPU id and CUDA GPU id: +// the following mappting between TF GPU id and platform GPU id: // -// TF GPU id -> CUDA GPU ID +// TF GPU id -> platform GPU ID // 0 (i.e. /device:GPU:0) -> 2 // 1 (i.e. /device:GPU:1) -> 0 // -// Note that CUDA GPU id 1 is filtered out by GPUOptions::visible_device_list, -// so it won't be used by the TF process. +// Note that platform GPU id 1 is filtered out by +// GPUOptions::visible_device_list, so it won't be used by the TF process. // // On the other hand, if we configure it to create 2 BaseGPUDevice per GPU // hardware, then setting GPUOptions::visible_device_list to "2,0" will create -// the following mappting between TF GPU id and CUDA GPU id: +// the following mappting between TF GPU id and platform GPU id: // -// TF GPU id -> CUDA GPU ID +// TF GPU id -> platform GPU ID // 0 (i.e. /device:GPU:0) -> 2 // 1 (i.e. /device:GPU:1) -> 2 // 2 (i.e. /device:GPU:2) -> 0 // 3 (i.e. /device:GPU:3) -> 0 // -// We create strong-typed integer classes for both TF GPU id and CUDA GPU id to -// minimize programming errors and improve code readability. Except for the +// We create strong-typed integer classes for both TF GPU id and platform GPU id +// to minimize programming errors and improve code readability. Except for the // StreamExecutor interface (as we don't change its API), whenever we need a -// TF GPU id (or CUDA GPU id) we should use TfGpuId (or CudaGpuId) instead of a -// raw integer. +// TF GPU id (or platform GPU id) we should use TfGpuId (or PlatformGpuId) +// instead of a raw integer. TF_LIB_GTL_DEFINE_INT_TYPE(TfGpuId, int32); -TF_LIB_GTL_DEFINE_INT_TYPE(CudaGpuId, int32); +TF_LIB_GTL_DEFINE_INT_TYPE(PlatformGpuId, int32); } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc b/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc index b5099dc8ef..2b40730119 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc @@ -26,26 +26,27 @@ limitations under the License. namespace tensorflow { namespace { -// Manages the map between TfGpuId and CUDA GPU id. -class TfToCudaGpuIdMap { +// Manages the map between TfGpuId and platform GPU id. +class TfToPlatformGpuIdMap { public: - static TfToCudaGpuIdMap* singleton() { - static auto* id_map = new TfToCudaGpuIdMap; + static TfToPlatformGpuIdMap* singleton() { + static auto* id_map = new TfToPlatformGpuIdMap; return id_map; } - Status Insert(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id) LOCKS_EXCLUDED(mu_) { + Status Insert(TfGpuId tf_gpu_id, PlatformGpuId platform_gpu_id) + LOCKS_EXCLUDED(mu_) { std::pair<IdMapType::iterator, bool> result; { mutex_lock lock(mu_); - result = id_map_.insert({tf_gpu_id.value(), cuda_gpu_id.value()}); + result = id_map_.insert({tf_gpu_id.value(), platform_gpu_id.value()}); } - if (!result.second && cuda_gpu_id.value() != result.first->second) { + if (!result.second && platform_gpu_id.value() != result.first->second) { return errors::AlreadyExists( "TensorFlow device (GPU:", tf_gpu_id.value(), ") is being mapped to " "multiple CUDA devices (", - cuda_gpu_id.value(), " now, and ", result.first->second, + platform_gpu_id.value(), " now, and ", result.first->second, " previously), which is not supported. " "This may be the result of providing different GPU configurations " "(ConfigProto.gpu_options, for example different visible_device_list)" @@ -56,17 +57,17 @@ class TfToCudaGpuIdMap { return Status::OK(); } - bool Find(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id) const + bool Find(TfGpuId tf_gpu_id, PlatformGpuId* platform_gpu_id) const LOCKS_EXCLUDED(mu_) { mutex_lock lock(mu_); auto result = id_map_.find(tf_gpu_id.value()); if (result == id_map_.end()) return false; - *cuda_gpu_id = result->second; + *platform_gpu_id = result->second; return true; } private: - TfToCudaGpuIdMap() = default; + TfToPlatformGpuIdMap() = default; void TestOnlyReset() LOCKS_EXCLUDED(mu_) { mutex_lock lock(mu_); @@ -78,17 +79,18 @@ class TfToCudaGpuIdMap { IdMapType id_map_ GUARDED_BY(mu_); friend class ::tensorflow::GpuIdManager; - TF_DISALLOW_COPY_AND_ASSIGN(TfToCudaGpuIdMap); + TF_DISALLOW_COPY_AND_ASSIGN(TfToPlatformGpuIdMap); }; } // namespace -Status GpuIdManager::InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id, - CudaGpuId cuda_gpu_id) { - return TfToCudaGpuIdMap::singleton()->Insert(tf_gpu_id, cuda_gpu_id); +Status GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId tf_gpu_id, + PlatformGpuId platform_gpu_id) { + return TfToPlatformGpuIdMap::singleton()->Insert(tf_gpu_id, platform_gpu_id); } -Status GpuIdManager::TfToCudaGpuId(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id) { - if (TfToCudaGpuIdMap::singleton()->Find(tf_gpu_id, cuda_gpu_id)) { +Status GpuIdManager::TfToPlatformGpuId(TfGpuId tf_gpu_id, + PlatformGpuId* platform_gpu_id) { + if (TfToPlatformGpuIdMap::singleton()->Find(tf_gpu_id, platform_gpu_id)) { return Status::OK(); } return errors::NotFound("TensorFlow device GPU:", tf_gpu_id.value(), @@ -96,7 +98,7 @@ Status GpuIdManager::TfToCudaGpuId(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id) { } void GpuIdManager::TestOnlyReset() { - TfToCudaGpuIdMap::singleton()->TestOnlyReset(); + TfToPlatformGpuIdMap::singleton()->TestOnlyReset(); } } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager.h b/tensorflow/core/common_runtime/gpu/gpu_id_manager.h index 491d92ccdd..62df4310c4 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_id_manager.h +++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager.h @@ -21,15 +21,17 @@ limitations under the License. namespace tensorflow { -// Class that maintains a map from TfGpuId to CudaGpuId, and manages the +// Class that maintains a map from TfGpuId to PlatformGpuId, and manages the // translation between them. class GpuIdManager { public: - // Adds a mapping from tf_gpu_id to cuda_gpu_id. - static Status InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id); + // Adds a mapping from tf_gpu_id to platform_gpu_id. + static Status InsertTfPlatformGpuIdPair(TfGpuId tf_gpu_id, + PlatformGpuId platform_gpu_id); - // Gets the cuda_gpu_id associated with tf_gpu_id. Returns OK if found. - static Status TfToCudaGpuId(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id); + // Gets the platform_gpu_id associated with tf_gpu_id. Returns OK if found. + static Status TfToPlatformGpuId(TfGpuId tf_gpu_id, + PlatformGpuId* platform_gpu_id); // Clears the map. Used in unit tests only. static void TestOnlyReset(); diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc b/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc index a663ec7051..8bf3c6a308 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc @@ -22,38 +22,38 @@ limitations under the License. namespace tensorflow { namespace { -CudaGpuId TfToCudaGpuId(TfGpuId tf) { - CudaGpuId cuda; - TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf, &cuda)); - return cuda; +PlatformGpuId TfToPlatformGpuId(TfGpuId tf) { + PlatformGpuId platform_gpu_id; + TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf, &platform_gpu_id)); + return platform_gpu_id; } TEST(GpuIdManagerTest, Basics) { TfGpuId key_0(0); - CudaGpuId value_0(0); - TF_ASSERT_OK(GpuIdManager::InsertTfCudaGpuIdPair(key_0, value_0)); - EXPECT_EQ(value_0, TfToCudaGpuId(key_0)); + PlatformGpuId value_0(0); + TF_ASSERT_OK(GpuIdManager::InsertTfPlatformGpuIdPair(key_0, value_0)); + EXPECT_EQ(value_0, TfToPlatformGpuId(key_0)); // Multiple calls to map the same value is ok. - TF_ASSERT_OK(GpuIdManager::InsertTfCudaGpuIdPair(key_0, value_0)); - EXPECT_EQ(value_0, TfToCudaGpuId(key_0)); + TF_ASSERT_OK(GpuIdManager::InsertTfPlatformGpuIdPair(key_0, value_0)); + EXPECT_EQ(value_0, TfToPlatformGpuId(key_0)); // Map a different TfGpuId to a different value. TfGpuId key_1(3); - CudaGpuId value_1(2); - TF_ASSERT_OK(GpuIdManager::InsertTfCudaGpuIdPair(key_1, value_1)); - EXPECT_EQ(value_1, TfToCudaGpuId(key_1)); + PlatformGpuId value_1(2); + TF_ASSERT_OK(GpuIdManager::InsertTfPlatformGpuIdPair(key_1, value_1)); + EXPECT_EQ(value_1, TfToPlatformGpuId(key_1)); // Mapping a different TfGpuId to the same value is ok. TfGpuId key_2(10); - TF_ASSERT_OK(GpuIdManager::InsertTfCudaGpuIdPair(key_2, value_1)); - EXPECT_EQ(value_1, TfToCudaGpuId(key_2)); + TF_ASSERT_OK(GpuIdManager::InsertTfPlatformGpuIdPair(key_2, value_1)); + EXPECT_EQ(value_1, TfToPlatformGpuId(key_2)); // Mapping the same TfGpuId to a different value. - ASSERT_FALSE(GpuIdManager::InsertTfCudaGpuIdPair(key_2, value_0).ok()); + ASSERT_FALSE(GpuIdManager::InsertTfPlatformGpuIdPair(key_2, value_0).ok()); // Getting a nonexistent mapping. - ASSERT_FALSE(GpuIdManager::TfToCudaGpuId(TfGpuId(100), &value_0).ok()); + ASSERT_FALSE(GpuIdManager::TfToPlatformGpuId(TfGpuId(100), &value_0).ok()); } } // namespace diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h index b9c66b3328..b1f10fb1dc 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h +++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h @@ -24,34 +24,37 @@ limitations under the License. namespace tensorflow { -// Utility methods for translation between Tensorflow GPU ids and CUDA GPU ids. +// Utility methods for translation between Tensorflow GPU ids and platform GPU +// ids. class GpuIdUtil { public: // Convenient methods for getting the associated executor given a TfGpuId or - // CudaGpuId. - static se::port::StatusOr<se::StreamExecutor*> ExecutorForCudaGpuId( - se::Platform* gpu_manager, CudaGpuId cuda_gpu_id) { - return gpu_manager->ExecutorForDevice(cuda_gpu_id.value()); + // PlatformGpuId. + static se::port::StatusOr<se::StreamExecutor*> ExecutorForPlatformGpuId( + se::Platform* gpu_manager, PlatformGpuId platform_gpu_id) { + return gpu_manager->ExecutorForDevice(platform_gpu_id.value()); } - static se::port::StatusOr<se::StreamExecutor*> ExecutorForCudaGpuId( - CudaGpuId cuda_gpu_id) { - return ExecutorForCudaGpuId(GPUMachineManager(), cuda_gpu_id); + static se::port::StatusOr<se::StreamExecutor*> ExecutorForPlatformGpuId( + PlatformGpuId platform_gpu_id) { + return ExecutorForPlatformGpuId(GPUMachineManager(), platform_gpu_id); } static se::port::StatusOr<se::StreamExecutor*> ExecutorForTfGpuId( TfGpuId tf_gpu_id) { - CudaGpuId cuda_gpu_id; - TF_RETURN_IF_ERROR(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id)); - return ExecutorForCudaGpuId(cuda_gpu_id); + PlatformGpuId platform_gpu_id; + TF_RETURN_IF_ERROR( + GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id)); + return ExecutorForPlatformGpuId(platform_gpu_id); } - // Verify that the cuda_gpu_id associated with a TfGpuId is legitimate. + // Verify that the platform_gpu_id associated with a TfGpuId is legitimate. static void CheckValidTfGpuId(TfGpuId tf_gpu_id) { - CudaGpuId cuda_gpu_id; - TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id)); + PlatformGpuId platform_gpu_id; + TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id)); const int visible_device_count = GPUMachineManager()->VisibleDeviceCount(); - CHECK_LT(cuda_gpu_id.value(), visible_device_count) - << "cuda_gpu_id is outside discovered device range." - << " TF GPU id: " << tf_gpu_id << " CUDA GPU id: " << cuda_gpu_id + CHECK_LT(platform_gpu_id.value(), visible_device_count) + << "platform_gpu_id is outside discovered device range." + << " TF GPU id: " << tf_gpu_id + << " platform GPU id: " << platform_gpu_id << " visible device count: " << visible_device_count; } }; diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc index 9ec740fabe..3e95374fda 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc @@ -107,14 +107,15 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options, return nullptr; } - CudaGpuId cuda_gpu_id; - TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id)); + PlatformGpuId platform_gpu_id; + TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id)); int bus_id = BusIdForGPU(tf_gpu_id); while (bus_id >= gpu_visitors_.size()) { gpu_visitors_.push_back({}); } GPUMemAllocator* sub_allocator = new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), cuda_gpu_id, + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, (options.per_process_gpu_memory_fraction() > 1.0 || options.experimental().use_unified_memory()), gpu_visitors_[bus_id], {}); @@ -125,20 +126,21 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options, // If true, checks for memory overwrites by writing // distinctive patterns on both ends of allocated memory. if (useCudaMemoryGuardAllocator()) { - gpu_allocator = new GPUDebugAllocator(gpu_allocator, cuda_gpu_id); - gpu_allocator = new GPUNanResetAllocator(gpu_allocator, cuda_gpu_id); + gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_gpu_id); + gpu_allocator = new GPUNanResetAllocator(gpu_allocator, platform_gpu_id); } else if (useCudaMallocAllocator()) { // If true, passes all allocation requests through to cudaMalloc // useful for doing memory debugging with tools like cuda-memcheck // **WARNING** probably will not work in a multi-gpu scenario - gpu_allocator = new GPUcudaMallocAllocator(gpu_allocator, cuda_gpu_id); + gpu_allocator = + new GPUcudaMallocAllocator(gpu_allocator, platform_gpu_id); } Allocator* recording_allocator = nullptr; if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) { ProcessState::MemDesc md; md.loc = ProcessState::MemDesc::GPU; - md.dev_index = cuda_gpu_id.value(); + md.dev_index = platform_gpu_id.value(); md.gpu_registered = false; md.nic_registered = true; recording_allocator = new internal::RecordingAllocator( |