25 files changed, 333 insertions, 299 deletions
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index b019c99882..f29f4d6deb 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -780,12 +780,12 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator( // If device is not set, use the first found GPU device for the conversion. for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) { TfGpuId tf_gpu_id(tf_gpu_id_value); - CudaGpuId cuda_gpu_id; - Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id); + PlatformGpuId platform_gpu_id; + Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id); if (s.ok()) { VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device " - << cuda_gpu_id.value(); - cuda_device_id = cuda_gpu_id.value(); + << platform_gpu_id.value(); + cuda_device_id = platform_gpu_id.value(); GPUOptions gpu_options; // If the TF to Cuda gpu id mapping exist, the device and corresponding // allocator must have been initialized already, so the diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index 2b42d81f47..88cf8d5980 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -565,21 +565,22 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources( new TRTInt8Calibrator(device_buffers_, batch_size, name())); const string label(name()); auto segment_graph = &segment_graph_; - const int cuda_gpu_id = ctx->device()->tensorflow_gpu_device_info()->gpu_id; - if (cuda_gpu_id < 0) { + const int platform_gpu_id = + ctx->device()->tensorflow_gpu_device_info()->gpu_id; + if (platform_gpu_id < 0) { LOG(ERROR) << "Can't get gpu_device_info from context->device()"; return tensorflow::errors::InvalidArgument( "Context->device doesn't contain device info!"); } const int64 workspace_size_bytes = workspace_size_; cres->thr_.reset(new std::thread([cres, label, segment_graph, shapes, - cuda_gpu_id, workspace_size_bytes]() { - VLOG(0) << "Starting calibration thread on device " << cuda_gpu_id + platform_gpu_id, workspace_size_bytes]() { + VLOG(0) << "Starting calibration thread on device " << platform_gpu_id << ", Calibration Resource @ " << cres; - auto err = cudaSetDevice(cuda_gpu_id); + auto err = cudaSetDevice(platform_gpu_id); if (err != cudaSuccess) { // TODO(aaroey): should return error here. - LOG(ERROR) << "Couldn't set cuda device to " << cuda_gpu_id + LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id << " in calibration thread"; } // ConvertGraphDefToEngine() will try to build the engine. This thread diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc index 2d4c8d0201..c8db384b64 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc @@ -22,16 +22,17 @@ limitations under the License. 
namespace tensorflow { -GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory, - const string& name) - : GPUBFCAllocator(cuda_gpu_id, total_memory, GPUOptions(), name) {} +GPUBFCAllocator::GPUBFCAllocator(PlatformGpuId platform_gpu_id, + size_t total_memory, const string& name) + : GPUBFCAllocator(platform_gpu_id, total_memory, GPUOptions(), name) {} -GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory, +GPUBFCAllocator::GPUBFCAllocator(PlatformGpuId platform_gpu_id, + size_t total_memory, const GPUOptions& gpu_options, const string& name) : BFCAllocator( new GPUMemAllocator( - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(), + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), gpu_options.per_process_gpu_memory_fraction() > 1.0 || gpu_options.experimental().use_unified_memory()), total_memory, gpu_options.allow_growth(), name) {} diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h index f1cc2eace1..435ffb4959 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h @@ -35,11 +35,11 @@ namespace tensorflow { // algorithm. class GPUBFCAllocator : public BFCAllocator { public: - // 'cuda_gpu_id' refers to the ID of the GPU device within + // 'platform_gpu_id' refers to the ID of the GPU device within // the process and must reference a valid ID in the process. - GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory, + GPUBFCAllocator(PlatformGpuId platform_gpu_id, size_t total_memory, const string& name); - GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory, + GPUBFCAllocator(PlatformGpuId platform_gpu_id, size_t total_memory, const GPUOptions& gpu_options, const string& name); virtual ~GPUBFCAllocator() {} diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc index 67caeb3495..518ccba580 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -46,7 +46,7 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use, } TEST(GPUBFCAllocatorTest, NoDups) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1 << 30, "GPU_0_bfc"); CheckStats(&a, 0, 0, 0, 0); // Allocate a lot of raw pointers @@ -75,7 +75,7 @@ TEST(GPUBFCAllocatorTest, NoDups) { } TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1 << 30, "GPU_0_bfc"); // Allocate 256 raw pointers of sizes between 100 bytes and about // a meg random::PhiloxRandom philox(123, 17); @@ -133,7 +133,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) { } TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1 << 30, "GPU_0_bfc"); CheckStats(&a, 0, 0, 0, 0); float* first_ptr = a.Allocate<float>(1024); @@ -168,18 +168,18 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { } TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1 << 30, "GPU_0_bfc"); float* ptr = a.Allocate<float>(0); EXPECT_EQ(nullptr, ptr); } TEST(GPUBFCAllocatorTest, TracksSizes) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + 
GPUBFCAllocator a(PlatformGpuId(0), 1 << 30, "GPU_0_bfc"); EXPECT_EQ(true, a.TracksAllocationSizes()); } TEST(GPUBFCAllocatorTest, AllocatedVsRequested) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1 << 30, "GPU_0_bfc"); float* t1 = a.Allocate<float>(1); EXPECT_EQ(4, a.RequestedSize(t1)); EXPECT_EQ(256, a.AllocatedSize(t1)); @@ -188,7 +188,7 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) { TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) { // Configure a 1MiB byte limit - GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1 << 20, "GPU_0_bfc"); float* first_ptr = a.Allocate<float>(1 << 6); float* second_ptr = a.Allocate<float>(1 << 20); @@ -203,7 +203,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) { options.set_allow_growth(true); // Max of 2GiB, but starts out small. - GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1LL << 31, options, "GPU_0_bfc"); // Allocate 10 raw pointers of sizes between 100 bytes and about // 64 megs. @@ -264,8 +264,8 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) { } TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) { - GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc"); - GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1UL << 60, "GPU_0_bfc"); + GPUBFCAllocator b(PlatformGpuId(0), 1UL << 60, "GPU_0_bfc"); void* amem = a.AllocateRaw(1, 1); void* bmem = b.AllocateRaw(1, 1 << 30); a.DeallocateRaw(amem); @@ -273,7 +273,7 @@ TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) { } static void BM_Allocation(int iters) { - GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1uLL << 33, "GPU_0_bfc"); // Exercise a few different allocation sizes std::vector<size_t> sizes = {256, 4096, 16384, 524288, 512, 1048576, 10485760, 104857600, @@ -289,7 +289,7 @@ static void BM_Allocation(int iters) { BENCHMARK(BM_Allocation); static void BM_AllocationThreaded(int iters, int num_threads) { - GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1uLL << 33, "GPU_0_bfc"); thread::ThreadPool pool(Env::Default(), "test", num_threads); std::atomic_int_fast32_t count(iters); mutex done_lock; @@ -325,7 +325,7 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16); // A more complex benchmark that defers deallocation of an object for // "delay" allocations. static void BM_AllocationDelayed(int iters, int delay) { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1 << 30, "GPU_0_bfc"); // Exercise a few different allocation sizes std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024}; int size_index = 0; @@ -363,7 +363,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test { // only methods inside this class can access private members of BFCAllocator. 
void TestBinDebugInfo() { - GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1 << 30, "GPU_0_bfc"); std::vector<void*> initial_ptrs; std::vector<size_t> initial_ptrs_allocated_sizes; @@ -441,7 +441,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test { } void TestLog2FloorNonZeroSlow() { - GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc"); + GPUBFCAllocator a(PlatformGpuId(0), 1 /* total_memory */, "GPU_0_bfc"); EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0)); EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1)); EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2)); diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc index 934a57a5fb..553a5628ad 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc @@ -28,9 +28,10 @@ limitations under the License. namespace tensorflow { GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator, - CudaGpuId cuda_gpu_id) + PlatformGpuId platform_gpu_id) : base_allocator_(allocator) { - stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + stream_exec_ = + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); } GPUcudaMallocAllocator::~GPUcudaMallocAllocator() { delete base_allocator_; } diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h index 856fdc34b4..8f38cc5a18 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h @@ -32,7 +32,7 @@ namespace tensorflow { class GPUcudaMallocAllocator : public VisitableAllocator { public: explicit GPUcudaMallocAllocator(VisitableAllocator* allocator, - CudaGpuId cuda_gpu_id); + PlatformGpuId platform_gpu_id); ~GPUcudaMallocAllocator() override; string Name() override { return "gpu_debug"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override; diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc index e4c834b30d..badb021aa5 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc @@ -74,9 +74,10 @@ void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) { // GPUDebugAllocator // ----------------------------------------------------------------------------- GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator, - CudaGpuId cuda_gpu_id) + PlatformGpuId platform_gpu_id) : base_allocator_(allocator) { - stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + stream_exec_ = + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); } GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; } @@ -159,9 +160,10 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) { // GPUNanResetAllocator // ----------------------------------------------------------------------------- GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator, - CudaGpuId cuda_gpu_id) + PlatformGpuId platform_gpu_id) : base_allocator_(allocator) { - stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + stream_exec_ = + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); } GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; } 
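For reference, a minimal sketch of how the renamed debug wrappers from gpu_debug_allocator.cc above compose around the BFC allocator after this change (it mirrors the updated allocator tests elsewhere in this diff; the 1 << 30 byte limit and the empty allocator name are arbitrary placeholders):

    const PlatformGpuId platform_gpu_id(0);
    // NaN reset must be the outer-most allocator.
    GPUNanResetAllocator a(
        new GPUDebugAllocator(new GPUBFCAllocator(platform_gpu_id, 1 << 30, ""),
                              platform_gpu_id),
        platform_gpu_id);
    // Executor lookup is also keyed on the platform GPU id after the rename.
    auto stream_exec =
        GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();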
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h index 0f9b72040c..9e007ed8c1 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h @@ -34,7 +34,7 @@ namespace tensorflow { class GPUDebugAllocator : public VisitableAllocator { public: explicit GPUDebugAllocator(VisitableAllocator* allocator, - CudaGpuId cuda_gpu_id); + PlatformGpuId platform_gpu_id); ~GPUDebugAllocator() override; string Name() override { return "gpu_debug"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override; @@ -66,7 +66,7 @@ class GPUDebugAllocator : public VisitableAllocator { class GPUNanResetAllocator : public VisitableAllocator { public: explicit GPUNanResetAllocator(VisitableAllocator* allocator, - CudaGpuId cuda_gpu_id); + PlatformGpuId platform_gpu_id); ~GPUNanResetAllocator() override; string Name() override { return "gpu_nan_reset"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override; diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc index 236a0afa0b..bc3e3a8c35 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc @@ -34,10 +34,11 @@ namespace tensorflow { namespace { TEST(GPUDebugAllocatorTest, OverwriteDetection_None) { - const CudaGpuId cuda_gpu_id(0); - GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), - cuda_gpu_id); - auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + const PlatformGpuId platform_gpu_id(0); + GPUDebugAllocator a(new GPUBFCAllocator(platform_gpu_id, 1 << 30, ""), + platform_gpu_id); + auto stream_exec = + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); for (int s : {8}) { std::vector<int64> cpu_array(s); @@ -58,11 +59,11 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) { for (int s : {8, 211}) { EXPECT_DEATH( { - const CudaGpuId cuda_gpu_id(0); - GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), - cuda_gpu_id); + const PlatformGpuId platform_gpu_id(0); + GPUDebugAllocator a(new GPUBFCAllocator(platform_gpu_id, 1 << 30, ""), + platform_gpu_id); auto stream_exec = - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); std::vector<int64> cpu_array(s); memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64)); @@ -91,11 +92,11 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) { for (int s : {8, 22}) { EXPECT_DEATH( { - const CudaGpuId cuda_gpu_id(0); - GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), - cuda_gpu_id); + const PlatformGpuId platform_gpu_id(0); + GPUDebugAllocator a(new GPUBFCAllocator(platform_gpu_id, 1 << 30, ""), + platform_gpu_id); auto stream_exec = - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); std::vector<int64> cpu_array(s); memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64)); @@ -121,10 +122,11 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) { } TEST(GPUDebugAllocatorTest, ResetToNan) { - const CudaGpuId cuda_gpu_id(0); - GPUNanResetAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), - cuda_gpu_id); - auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + const PlatformGpuId 
platform_gpu_id(0); + GPUNanResetAllocator a(new GPUBFCAllocator(platform_gpu_id, 1 << 30, ""), + platform_gpu_id); + auto stream_exec = + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); std::vector<float> cpu_array(1024); std::vector<float> cpu_array_result(1024); @@ -161,13 +163,14 @@ TEST(GPUDebugAllocatorTest, ResetToNan) { } TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) { - const CudaGpuId cuda_gpu_id(0); + const PlatformGpuId platform_gpu_id(0); // NaN reset must be the outer-most allocator. GPUNanResetAllocator a( - new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), - cuda_gpu_id), - cuda_gpu_id); - auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + new GPUDebugAllocator(new GPUBFCAllocator(platform_gpu_id, 1 << 30, ""), + platform_gpu_id), + platform_gpu_id); + auto stream_exec = + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); std::vector<float> cpu_array(1024); std::vector<float> cpu_array_result(1024); @@ -204,18 +207,18 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) { } TEST(GPUDebugAllocatorTest, TracksSizes) { - const CudaGpuId cuda_gpu_id(0); - GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), - cuda_gpu_id); + const PlatformGpuId platform_gpu_id(0); + GPUDebugAllocator a(new GPUBFCAllocator(platform_gpu_id, 1 << 30, ""), + platform_gpu_id); EXPECT_EQ(true, a.TracksAllocationSizes()); } TEST(GPUDebugAllocatorTest, AllocatedVsRequested) { - const CudaGpuId cuda_gpu_id(0); + const PlatformGpuId platform_gpu_id(0); GPUNanResetAllocator a( - new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""), - cuda_gpu_id), - cuda_gpu_id); + new GPUDebugAllocator(new GPUBFCAllocator(platform_gpu_id, 1 << 30, ""), + platform_gpu_id), + platform_gpu_id); float* t1 = a.Allocate<float>(1); EXPECT_EQ(4, a.RequestedSize(t1)); EXPECT_EQ(256, a.AllocatedSize(t1)); diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 2763ac0d4a..4bf23bc017 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -105,9 +105,9 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface { reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize); stream_ = cuda_stream; allocator_ = alloc; - CudaGpuId cuda_gpu_id; - TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id)); - device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id.value()]; + PlatformGpuId platform_gpu_id; + TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id)); + device_prop_ = &Eigen::m_deviceProperties[platform_gpu_id.value()]; } const cudaStream_t& stream() const override { return *stream_; } @@ -332,9 +332,10 @@ Status BaseGPUDevice::Init(const SessionOptions& options) { gpu_device_info_->stream = streams_[0]->compute; gpu_device_info_->default_context = device_contexts_[0]; gpu_device_info_->event_mgr = em_.get(); - CudaGpuId cuda_gpu_id; - TF_RETURN_IF_ERROR(GpuIdManager::TfToCudaGpuId(tf_gpu_id_, &cuda_gpu_id)); - gpu_device_info_->gpu_id = cuda_gpu_id.value(); + PlatformGpuId platform_gpu_id; + TF_RETURN_IF_ERROR( + GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id)); + gpu_device_info_->gpu_id = platform_gpu_id.value(); set_tensorflow_gpu_device_info(gpu_device_info_); // Whether and how the GPU device uses its own threadpool. 
@@ -690,9 +691,9 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice { Eigen::GpuDevice device_; }; -// Parse 'visible_device_list' into a list of CUDA GPU ids. +// Parse 'visible_device_list' into a list of platform GPU ids. Status ParseVisibleDeviceList(const string& visible_device_list, - std::vector<CudaGpuId>* visible_gpu_order) { + std::vector<PlatformGpuId>* visible_gpu_order) { visible_gpu_order->clear(); se::Platform* gpu_manager = GPUMachineManager(); @@ -707,26 +708,28 @@ Status ParseVisibleDeviceList(const string& visible_device_list, } else { const std::vector<string> order_str = str_util::Split(visible_device_list, ','); - for (const string& cuda_gpu_id_str : order_str) { - int32 cuda_gpu_id; - if (!strings::safe_strto32(cuda_gpu_id_str, &cuda_gpu_id)) { + for (const string& platform_gpu_id_str : order_str) { + int32 platform_gpu_id; + if (!strings::safe_strto32(platform_gpu_id_str, &platform_gpu_id)) { return errors::InvalidArgument( "Could not parse entry in 'visible_device_list': '", - cuda_gpu_id_str, "'. visible_device_list = ", visible_device_list); + platform_gpu_id_str, "'. visible_device_list = ", + visible_device_list); } - if (cuda_gpu_id < 0 || cuda_gpu_id >= gpu_manager->VisibleDeviceCount()) { + if (platform_gpu_id < 0 || + platform_gpu_id >= gpu_manager->VisibleDeviceCount()) { return errors::InvalidArgument( - "'visible_device_list' listed an invalid GPU id '", cuda_gpu_id, + "'visible_device_list' listed an invalid GPU id '", platform_gpu_id, "' but visible device count is ", gpu_manager->VisibleDeviceCount()); } - visible_gpu_order->push_back(CudaGpuId(cuda_gpu_id)); + visible_gpu_order->push_back(PlatformGpuId(platform_gpu_id)); } } // Validate no repeats. - std::set<CudaGpuId> visible_device_set(visible_gpu_order->begin(), - visible_gpu_order->end()); + std::set<PlatformGpuId> visible_device_set(visible_gpu_order->begin(), + visible_gpu_order->end()); if (visible_device_set.size() != visible_gpu_order->size()) { return errors::InvalidArgument( "visible_device_list contained a duplicate entry: ", @@ -737,8 +740,8 @@ Status ParseVisibleDeviceList(const string& visible_device_list, Status VerifyVirtualDeviceSettings( const size_t num_gpus_to_use, const GPUOptions& gpu_options, - const std::vector<CudaGpuId>& visible_gpu_order, - const std::vector<CudaGpuId>& valid_cuda_gpu_ids) { + const std::vector<PlatformGpuId>& visible_gpu_order, + const std::vector<PlatformGpuId>& valid_platform_gpu_ids) { const auto& virtual_devices = gpu_options.experimental().virtual_devices(); CHECK(!virtual_devices.empty()); if (gpu_options.per_process_gpu_memory_fraction() > 0) { @@ -760,11 +763,11 @@ Status VerifyVirtualDeviceSettings( " #GPUs in visible_device_list: ", visible_gpu_order.size(), " virtual_devices.size(): ", virtual_devices.size()); } - if (valid_cuda_gpu_ids.size() != virtual_devices.size()) { + if (valid_platform_gpu_ids.size() != virtual_devices.size()) { return errors::Unknown( "The number of valid GPUs doesn't match the number of elements in " "the virtual_devices list.", - " #valid GPUs: ", valid_cuda_gpu_ids.size(), + " #valid GPUs: ", valid_platform_gpu_ids.size(), " virtual_devices.size(): ", virtual_devices.size()); } return Status::OK(); @@ -806,18 +809,18 @@ int64 MinSystemMemory(int64 available_memory) { } // Get the memory limit for the virtual device being created on GPU with -// 'cuda_gpu_id', when that virtual device is the only virtual device being +// 'platform_gpu_id', when that virtual device is the only virtual device being // created on 
that GPU. Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options, - CudaGpuId cuda_gpu_id, + PlatformGpuId platform_gpu_id, int64* memory_limit) { int64 total_memory = 0; int64 available_memory = 0; se::StreamExecutor* se = - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) { return errors::Unknown("Failed to query available memory for GPU ", - cuda_gpu_id.value()); + platform_gpu_id.value()); } int64 allocated_memory = 0; @@ -916,8 +919,8 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, num_gpus_to_use = iter->second; } const auto& gpu_options = options.config.gpu_options(); - std::vector<CudaGpuId> visible_gpu_order; - std::vector<CudaGpuId> valid_cuda_gpu_ids; + std::vector<PlatformGpuId> visible_gpu_order; + std::vector<PlatformGpuId> valid_platform_gpu_ids; // If we aren't going to use any GPUs, don't initialize them. // We don't want to call ParseVisibleDeviceList if num_gpus_to_use is 0, // because it treats an empty gpu_options.visible_device_list as 'all GPUs are @@ -926,12 +929,12 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(), &visible_gpu_order)); TF_RETURN_IF_ERROR( - GetValidDeviceIds(visible_gpu_order, &valid_cuda_gpu_ids)); + GetValidDeviceIds(visible_gpu_order, &valid_platform_gpu_ids)); } - if (num_gpus_to_use > valid_cuda_gpu_ids.size()) { - num_gpus_to_use = valid_cuda_gpu_ids.size(); + if (num_gpus_to_use > valid_platform_gpu_ids.size()) { + num_gpus_to_use = valid_platform_gpu_ids.size(); } - if (!valid_cuda_gpu_ids.empty()) { + if (!valid_platform_gpu_ids.empty()) { // Save the original device. int original_device = 0; cudaError_t err = cudaGetDevice(&original_device); @@ -941,17 +944,18 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, } // Force to implicitly initialize CUDA runtime on each valid GPU before // CreateGPUDevice(). - for (CudaGpuId cuda_gpu_id : valid_cuda_gpu_ids) { - err = cudaSetDevice(cuda_gpu_id.value()); + for (PlatformGpuId platform_gpu_id : valid_platform_gpu_ids) { + err = cudaSetDevice(platform_gpu_id.value()); if (err != cudaSuccess) { - return errors::Internal("cudaSetDevice() on GPU:", cuda_gpu_id.value(), - " failed. Status: ", cudaGetErrorString(err)); + return errors::Internal("cudaSetDevice() on GPU:", + platform_gpu_id.value(), " failed. Status: ", + cudaGetErrorString(err)); } err = cudaFree(nullptr); if (err != cudaSuccess) { - return errors::Internal( - "CUDA runtime implicit initialization on GPU:", cuda_gpu_id.value(), - " failed. Status: ", cudaGetErrorString(err)); + return errors::Internal("CUDA runtime implicit initialization on GPU:", + platform_gpu_id.value(), " failed. Status: ", + cudaGetErrorString(err)); } } // Reset to the original device. 
@@ -977,10 +981,10 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, LOG(INFO) << line_buf; for (int i = 0; i < visible_gpu_order.size(); ++i) { line_buf = strings::StrCat(visible_gpu_order[i].value(), ": "); - CudaGpuId cuda_id_i = visible_gpu_order[i]; + PlatformGpuId gpu_id_i = visible_gpu_order[i]; for (int j = 0; j < visible_gpu_order.size(); ++j) { - CudaGpuId cuda_id_j = visible_gpu_order[j]; - if (im.directed_links.find({cuda_id_i, cuda_id_j}) != + PlatformGpuId gpu_id_j = visible_gpu_order[j]; + if (im.directed_links.find({gpu_id_i, gpu_id_j}) != im.directed_links.end()) { line_buf.append("Y "); } else { @@ -993,22 +997,23 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, const auto& virtual_devices = gpu_options.experimental().virtual_devices(); if (!virtual_devices.empty()) { - TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings( - num_gpus_to_use, gpu_options, visible_gpu_order, valid_cuda_gpu_ids)); + TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(num_gpus_to_use, gpu_options, + visible_gpu_order, + valid_platform_gpu_ids)); // We've verified that num_gpus_to_use >= virtual_devices.size(). num_gpus_to_use = virtual_devices.size(); CHECK(gpu_options.visible_device_list().empty() || - valid_cuda_gpu_ids == visible_gpu_order); + valid_platform_gpu_ids == visible_gpu_order); } int next_tf_gpu_id = 0; std::vector<int64> memory_limit_bytes; for (int i = 0; i < num_gpus_to_use; ++i) { - const CudaGpuId cuda_gpu_id = valid_cuda_gpu_ids[i]; + const PlatformGpuId platform_gpu_id = valid_platform_gpu_ids[i]; if (virtual_devices.empty() || virtual_devices.Get(i).memory_limit_mb_size() == 0) { int64 single_virtual_device_memory_limit = 0; TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit( - gpu_options, cuda_gpu_id, &single_virtual_device_memory_limit)); + gpu_options, platform_gpu_id, &single_virtual_device_memory_limit)); memory_limit_bytes.push_back(single_virtual_device_memory_limit); } else { const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb(); @@ -1021,7 +1026,7 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, TfGpuId tf_gpu_id(next_tf_gpu_id); ++next_tf_gpu_id; TF_RETURN_IF_ERROR( - GpuIdManager::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id)); + GpuIdManager::InsertTfPlatformGpuIdPair(tf_gpu_id, platform_gpu_id)); } } const int num_tf_gpus = next_tf_gpu_id; @@ -1046,7 +1051,7 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, return Status::OK(); } -static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id, +static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id, const se::DeviceDescription& desc) { int cc_major; int cc_minor; @@ -1055,9 +1060,8 @@ static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id, cc_minor = 0; } // LINT.IfChange - return strings::StrCat("device: ", cuda_gpu_id.value(), - ", name: ", desc.name(), - ", pci bus id: ", desc.pci_bus_id(), + return strings::StrCat("device: ", platform_gpu_id.value(), ", name: ", + desc.name(), ", pci bus id: ", desc.pci_bus_id(), ", compute capability: ", cc_major, ".", cc_minor); // LINT.ThenChange(//tensorflow/python/platform/test.py) } @@ -1072,12 +1076,13 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options, const string device_name = strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value()); GpuIdUtil::CheckValidTfGpuId(tf_gpu_id); - CudaGpuId cuda_gpu_id; - TF_RETURN_IF_ERROR(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id)); + PlatformGpuId 
platform_gpu_id; + TF_RETURN_IF_ERROR( + GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id)); int numa_node = dev_locality.numa_node(); se::StreamExecutor* se = - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); const se::DeviceDescription& desc = se->GetDeviceDescription(); GPUProcessState* process_state = GPUProcessState::singleton(); Allocator* gpu_allocator = process_state->GetGPUAllocator( @@ -1098,11 +1103,11 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options, // TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit. BaseGPUDevice* gpu_device = CreateGPUDevice( options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality, - tf_gpu_id, GetShortDeviceDescription(cuda_gpu_id, desc), gpu_allocator, - ProcessState::singleton()->GetCPUAllocator(numa_node)); + tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, desc), + gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node)); LOG(INFO) << "Created TensorFlow device (" << device_name << " with " << (stats.bytes_limit >> 20) << " MB memory) -> physical GPU (" - << GetShortDeviceDescription(cuda_gpu_id, desc) << ")"; + << GetShortDeviceDescription(platform_gpu_id, desc) << ")"; TF_RETURN_IF_ERROR(gpu_device->Init(options)); devices->push_back(gpu_device); @@ -1110,18 +1115,21 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options, } namespace { -std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> +std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>> GetPeerAccessMap(se::Platform* platform, - const std::vector<CudaGpuId>& visible_gpu_order) { - std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> map( - new std::map<std::pair<CudaGpuId, CudaGpuId>, bool>); - for (CudaGpuId cuda_gpu_i : visible_gpu_order) { - for (CudaGpuId cuda_gpu_j : visible_gpu_order) { + const std::vector<PlatformGpuId>& visible_gpu_order) { + std::unique_ptr<std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>> map( + new std::map<std::pair<PlatformGpuId, PlatformGpuId>, bool>); + for (PlatformGpuId platform_gpu_i : visible_gpu_order) { + for (PlatformGpuId platform_gpu_j : visible_gpu_order) { se::StreamExecutor* from = - GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_i) + .ValueOrDie(); se::StreamExecutor* to = - GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie(); - (*map)[{cuda_gpu_i, cuda_gpu_j}] = from->CanEnablePeerAccessTo(to); + GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_j) + .ValueOrDie(); + (*map)[{platform_gpu_i, platform_gpu_j}] = + from->CanEnablePeerAccessTo(to); } } @@ -1131,19 +1139,19 @@ GetPeerAccessMap(se::Platform* platform, } // namespace Status BaseGPUDeviceFactory::GetInterconnectMaps( - const std::vector<CudaGpuId>& visible_gpu_order, se::Platform* gpu_manager, - std::vector<InterconnectMap>* maps) { + const std::vector<PlatformGpuId>& visible_gpu_order, + se::Platform* gpu_manager, std::vector<InterconnectMap>* maps) { // The default interconnect map is obtained from the StreamExecutor. 
auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order); maps->resize(1); InterconnectMap& imap = maps->at(0); imap.name = "StreamExecutor"; imap.strength = InterconnectMap::kStreamExecutorStrength; - for (CudaGpuId cuda_id_i : visible_gpu_order) { - for (CudaGpuId cuda_id_j : visible_gpu_order) { - if (cuda_id_i == cuda_id_j) continue; - if ((*access_map)[{cuda_id_i, cuda_id_j}]) { - imap.directed_links.insert({cuda_id_i, cuda_id_j}); + for (PlatformGpuId gpu_id_i : visible_gpu_order) { + for (PlatformGpuId gpu_id_j : visible_gpu_order) { + if (gpu_id_i == gpu_id_j) continue; + if ((*access_map)[{gpu_id_i, gpu_id_j}]) { + imap.directed_links.insert({gpu_id_i, gpu_id_j}); } } } @@ -1158,13 +1166,14 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities( all_tf_gpu_ids.push_back(TfGpuId(i)); } for (TfGpuId tf_gpu_id : all_tf_gpu_ids) { - CudaGpuId cuda_gpu_id; - TF_RETURN_IF_ERROR(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id)); + PlatformGpuId platform_gpu_id; + TF_RETURN_IF_ERROR( + GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id)); // Get GPU bus_id from its reported NUMA affinity. Because GPUs are // virtualized in some environments, we can't just use the GPU id. // NUMA locales are indexed from 0, buses are indexed from 1. se::StreamExecutor* se = - GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(); const se::DeviceDescription& desc = se->GetDeviceDescription(); int numa_node = desc.numa_node(); if (numa_node < 0) { @@ -1174,7 +1183,8 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities( // may run into trouble later with data transfer operations. The // trouble may manifest as slower than expected performance, or // outright failures. - LOG(INFO) << "Could not identify NUMA node of CUDA gpu id " << cuda_gpu_id + LOG(INFO) << "Could not identify NUMA node of platform GPU id " + << platform_gpu_id << ", defaulting to 0. Your kernel may not have been built " << "with NUMA support."; numa_node = 0; @@ -1187,10 +1197,10 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities( LocalLinks* links = dev_locality.mutable_links(); for (const InterconnectMap& imap : interconnects) { for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) { - CudaGpuId cuda_gpu_dst; + PlatformGpuId platform_gpu_dst; TF_RETURN_IF_ERROR( - GpuIdManager::TfToCudaGpuId(tf_gpu_dst, &cuda_gpu_dst)); - if (imap.directed_links.find({cuda_gpu_id, cuda_gpu_dst}) != + GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst)); + if (imap.directed_links.find({platform_gpu_id, platform_gpu_dst}) != imap.directed_links.end()) { InterconnectLink* ilink = links->add_link(); ilink->set_device_id(tf_gpu_dst.value()); @@ -1204,10 +1214,10 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities( // add high strength links to the others. 
for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) { if (tf_gpu_id == tf_gpu_dst) continue; - CudaGpuId cuda_gpu_dst; + PlatformGpuId platform_gpu_dst; TF_RETURN_IF_ERROR( - GpuIdManager::TfToCudaGpuId(tf_gpu_dst, &cuda_gpu_dst)); - if (cuda_gpu_id == cuda_gpu_dst) { + GpuIdManager::TfToPlatformGpuId(tf_gpu_dst, &platform_gpu_dst)); + if (platform_gpu_id == platform_gpu_dst) { InterconnectLink* ilink = links->add_link(); ilink->set_device_id(tf_gpu_dst.value()); ilink->set_type("SAME_DEVICE"); @@ -1216,9 +1226,9 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities( } (*localities)[tf_gpu_id] = dev_locality; - VLOG(1) << "GPUDevice CudaGpuId " << cuda_gpu_id << " TfGpuId " << tf_gpu_id - << " on bus " << dev_locality.bus_id() << " numa: " << numa_node - << " pci: " << desc.pci_bus_id() + VLOG(1) << "GPUDevice PlatformGpuId " << platform_gpu_id << " TfGpuId " + << tf_gpu_id << " on bus " << dev_locality.bus_id() + << " numa: " << numa_node << " pci: " << desc.pci_bus_id() << " DeviceLocality: " << dev_locality.DebugString(); } return Status::OK(); @@ -1226,14 +1236,14 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities( static int GetDefaultMinGPUMultiprocessorCount( se::Platform* gpu_manager, - const std::vector<CudaGpuId>& visible_gpu_order) { + const std::vector<PlatformGpuId>& visible_gpu_order) { static const int kDefaultMinGPUMultiprocessorCount = 8; // Find the highest multi-processor count across all visible GPUs. int max_count = -1; for (int i = 0; i < visible_gpu_order.size(); ++i) { auto exec_status = - GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_order[i]); + GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_order[i]); if (!exec_status.ok()) { continue; } @@ -1252,7 +1262,7 @@ static int GetDefaultMinGPUMultiprocessorCount( static int GetMinGPUMultiprocessorCount( se::Platform* gpu_manager, - const std::vector<CudaGpuId>& visible_gpu_order) { + const std::vector<PlatformGpuId>& visible_gpu_order) { const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT"); if (tf_min_gpu_core_count == nullptr || @@ -1330,18 +1340,20 @@ std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() { } Status EnablePeerAccess(se::Platform* platform, - const std::vector<CudaGpuId>& visible_gpu_order) { + const std::vector<PlatformGpuId>& visible_gpu_order) { int possible_peer_count = 0; int enabled_peer_count = 0; for (int i = 0; i < visible_gpu_order.size(); ++i) { - const CudaGpuId cuda_gpu_i = visible_gpu_order[i]; + const PlatformGpuId platform_gpu_i = visible_gpu_order[i]; for (int j = 0; j < visible_gpu_order.size(); ++j) { - const CudaGpuId cuda_gpu_j = visible_gpu_order[j]; + const PlatformGpuId platform_gpu_j = visible_gpu_order[j]; // We have already validated that ExecutorForDevice() calls return OK. 
se::StreamExecutor* from = - GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_i) + .ValueOrDie(); se::StreamExecutor* to = - GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(platform, platform_gpu_j) + .ValueOrDie(); if (from->CanEnablePeerAccessTo(to)) { ++possible_peer_count; @@ -1349,7 +1361,8 @@ Status EnablePeerAccess(se::Platform* platform, if (!status.ok()) { LOG(WARNING) << "Unable to enable peer access between device ordinals " - << cuda_gpu_i << " and " << cuda_gpu_j << ", status: " << status; + << platform_gpu_i << " and " << platform_gpu_j + << ", status: " << status; } else { ++enabled_peer_count; } @@ -1372,22 +1385,23 @@ Status EnablePeerAccess(se::Platform* platform, } // namespace Status BaseGPUDeviceFactory::GetValidDeviceIds( - const std::vector<CudaGpuId>& visible_gpu_order, - std::vector<CudaGpuId>* ids) { + const std::vector<PlatformGpuId>& visible_gpu_order, + std::vector<PlatformGpuId>* ids) { se::Platform* gpu_manager = GPUMachineManager(); bool new_gpu_found = false; for (int i = 0; i < visible_gpu_order.size(); ++i) { - const CudaGpuId cuda_gpu_id = visible_gpu_order[i]; + const PlatformGpuId visible_gpu_id = visible_gpu_order[i]; - // Only perform this once per visible cuda gpu id. - if (visible_gpu_initialized_[cuda_gpu_id.value()]) { + // Only perform this once per visible platform gpu id. + if (visible_gpu_initialized_[visible_gpu_id.value()]) { continue; } - visible_gpu_initialized_[cuda_gpu_id.value()] = true; + visible_gpu_initialized_[visible_gpu_id.value()] = true; new_gpu_found = true; - auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id); + auto executor = + GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_id); if (!executor.ok()) { return executor.status(); } @@ -1435,9 +1449,9 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( // Filter out devices that don't have the right capability or power. for (int i = 0; i < visible_gpu_order.size(); ++i) { - const CudaGpuId visible_gpu_id = visible_gpu_order[i]; + const PlatformGpuId visible_gpu_id = visible_gpu_order[i]; auto exec_status = - GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_id); + GpuIdUtil::ExecutorForPlatformGpuId(gpu_manager, visible_gpu_id); if (!exec_status.ok()) { LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id << " whose executor is in invalid state: " @@ -1486,7 +1500,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds( if (!ids->empty()) { std::vector<int> raw_ids(ids->size()); std::transform(ids->begin(), ids->end(), raw_ids.begin(), - [](CudaGpuId id) -> int { return id.value(); }); + [](PlatformGpuId id) -> int { return id.value(); }); LOG(INFO) << "Adding visible gpu devices: " << str_util::Join(raw_ids, ", "); } diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index 56d03d7a8c..684cc0c1de 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -89,12 +89,12 @@ class BaseGPUDevice : public LocalDevice { void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device, DeviceContext* dc, Allocator* allocator) override; - // Returns the CUDA GPU id of this device within the native driver system; + // Returns the platform GPU id of this device within the native driver system; // e.g., for CUDA this is the ordinal of the GPU within the system. 
int gpu_id() const { - CudaGpuId cuda_gpu_id; - TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id_, &cuda_gpu_id)); - return cuda_gpu_id.value(); + PlatformGpuId platform_gpu_id; + TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id_, &platform_gpu_id)); + return platform_gpu_id.value(); } // The executor that provides control for the device; e.g., for CUDA this @@ -168,14 +168,14 @@ class BaseGPUDeviceFactory : public DeviceFactory { int32 strength; static const int kSameDeviceStrength; static const int kStreamExecutorStrength; - std::set<std::pair<CudaGpuId, CudaGpuId>> directed_links; + std::set<std::pair<PlatformGpuId, PlatformGpuId>> directed_links; }; protected: // Populates *maps with interconnect maps for all local direct access // pathways between GPUs. virtual Status GetInterconnectMaps( - const std::vector<CudaGpuId>& visible_gpu_order, + const std::vector<PlatformGpuId>& visible_gpu_order, se::Platform* gpu_manager, std::vector<InterconnectMap>* maps); struct TfGpuIdHash { @@ -207,16 +207,16 @@ class BaseGPUDeviceFactory : public DeviceFactory { Allocator* gpu_allocator, Allocator* cpu_allocator) = 0; - // Returns into 'ids' the list of valid CUDA GPU ids, in the order that + // Returns into 'ids' the list of valid platform GPU ids, in the order that // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc, // based upon 'visible_gpu_order' which was generated by parsing // GPUOptions::visible_device_list which is a comma-separated list of CUDA GPU // ids. - Status GetValidDeviceIds(const std::vector<CudaGpuId>& visible_gpu_order, - std::vector<CudaGpuId>* ids); + Status GetValidDeviceIds(const std::vector<PlatformGpuId>& visible_gpu_order, + std::vector<PlatformGpuId>* ids); - // visible_gpu_initialized_[cuda_gpu_id] is true if visible GPU cuda_gpu_id - // has been initialized by the process. + // visible_gpu_initialized_[platform_gpu_id] is true if visible GPU + // platform_gpu_id has been initialized by the process. std::unordered_map<int, bool> visible_gpu_initialized_; }; diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc index daf59f0560..36294094e9 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc @@ -30,18 +30,21 @@ namespace tensorflow { namespace { const char* kDeviceNamePrefix = "/job:localhost/replica:0/task:0"; -int64 GetTotalGPUMemory(CudaGpuId gpu_id) { +int64 GetTotalGPUMemory(PlatformGpuId gpu_id) { se::StreamExecutor* se = - GpuIdUtil::ExecutorForCudaGpuId(GPUMachineManager(), gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(GPUMachineManager(), gpu_id) + .ValueOrDie(); int64 total_memory, available_memory; CHECK(se->DeviceMemoryUsage(&available_memory, &total_memory)); return total_memory; } -Status GetComputeCapability(CudaGpuId gpu_id, int* cc_major, int* cc_minor) { +Status GetComputeCapability(PlatformGpuId gpu_id, int* cc_major, + int* cc_minor) { se::StreamExecutor* se = - GpuIdUtil::ExecutorForCudaGpuId(GPUMachineManager(), gpu_id).ValueOrDie(); + GpuIdUtil::ExecutorForPlatformGpuId(GPUMachineManager(), gpu_id) + .ValueOrDie(); if (!se->GetDeviceDescription().cuda_compute_capability(cc_major, cc_minor)) { *cc_major = 0; *cc_minor = 0; @@ -223,7 +226,7 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevices) { // error. 
TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) { int cc_major, cc_minor; - TF_ASSERT_OK(GetComputeCapability(CudaGpuId(0), &cc_major, &cc_minor)); + TF_ASSERT_OK(GetComputeCapability(PlatformGpuId(0), &cc_major, &cc_minor)); // Exit early while running on Pascal or later GPUs. if (cc_major >= 6) { return; @@ -244,10 +247,10 @@ TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) { // more memory than what is available on the device. TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) { static constexpr double kGpuMemoryFraction = 1.2; - static constexpr CudaGpuId kCudaGpuId(0); + static constexpr PlatformGpuId kPlatformGpuId(0); int cc_major, cc_minor; - TF_ASSERT_OK(GetComputeCapability(kCudaGpuId, &cc_major, &cc_minor)); + TF_ASSERT_OK(GetComputeCapability(kPlatformGpuId, &cc_major, &cc_minor)); // Exit early if running on pre-Pascal GPUs. if (cc_major < 6) { LOG(INFO) @@ -262,7 +265,7 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) { ASSERT_EQ(1, devices.size()); int64 memory_limit = devices[0]->attributes().memory_limit(); - ASSERT_EQ(memory_limit, static_cast<int64>(GetTotalGPUMemory(kCudaGpuId) * + ASSERT_EQ(memory_limit, static_cast<int64>(GetTotalGPUMemory(kPlatformGpuId) * kGpuMemoryFraction)); AllocatorAttributes allocator_attributes = AllocatorAttributes(); diff --git a/tensorflow/core/common_runtime/gpu/gpu_id.h b/tensorflow/core/common_runtime/gpu/gpu_id.h index 2a6caea296..f0d9022821 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_id.h +++ b/tensorflow/core/common_runtime/gpu/gpu_id.h @@ -25,10 +25,10 @@ namespace tensorflow { // physical machine, it can be filtered by CUDA environment variable // CUDA_VISIBLE_DEVICES. Note that this id is not visible to Tensorflow, but // result after filtering by CUDA_VISIBLE_DEVICES is visible to TF and is -// called CUDA GPU id as below. See +// called platform GPU id as below. See // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars // for more details. -// - CUDA GPU id (also called *visible* GPU id in +// - *platform* GPU id (also called *visible* GPU id in // third_party/tensorflow/core/protobuf/config.proto): this is the id that is // visible to Tensorflow after filtering by CUDA_VISIBLE_DEVICES, and is // generated by the CUDA GPU driver. It starts from 0 and is used for CUDA API @@ -39,14 +39,14 @@ namespace tensorflow { // field of the device name "/device:GPU:<id>", and is also the identifier of // a BaseGPUDevice. Note that the configuration allows us to create multiple // BaseGPUDevice per GPU hardware in order to use multi CUDA streams on the -// hardware, so the mapping between TF GPU id and CUDA GPU id is not a 1:1 +// hardware, so the mapping between TF GPU id and platform GPU id is not a 1:1 // mapping, see the example below. // // For example, assuming that in the machine we have GPU device with index 0, 1, // 2 and 3 (physical GPU id). 
Setting "CUDA_VISIBLE_DEVICES=1,2,3" will create -// the following mapping between CUDA GPU id and physical GPU id: +// the following mapping between platform GPU id and physical GPU id: // -// CUDA GPU id -> physical GPU id +// platform GPU id -> physical GPU id // 0 -> 1 // 1 -> 2 // 2 -> 3 @@ -56,32 +56,32 @@ namespace tensorflow { // // Assuming we configure the Session to create one BaseGPUDevice per GPU // hardware, then setting GPUOptions::visible_device_list to "2,0" will create -// the following mappting between TF GPU id and CUDA GPU id: +// the following mappting between TF GPU id and platform GPU id: // -// TF GPU id -> CUDA GPU ID +// TF GPU id -> platform GPU ID // 0 (i.e. /device:GPU:0) -> 2 // 1 (i.e. /device:GPU:1) -> 0 // -// Note that CUDA GPU id 1 is filtered out by GPUOptions::visible_device_list, -// so it won't be used by the TF process. +// Note that platform GPU id 1 is filtered out by +// GPUOptions::visible_device_list, so it won't be used by the TF process. // // On the other hand, if we configure it to create 2 BaseGPUDevice per GPU // hardware, then setting GPUOptions::visible_device_list to "2,0" will create -// the following mappting between TF GPU id and CUDA GPU id: +// the following mappting between TF GPU id and platform GPU id: // -// TF GPU id -> CUDA GPU ID +// TF GPU id -> platform GPU ID // 0 (i.e. /device:GPU:0) -> 2 // 1 (i.e. /device:GPU:1) -> 2 // 2 (i.e. /device:GPU:2) -> 0 // 3 (i.e. /device:GPU:3) -> 0 // -// We create strong-typed integer classes for both TF GPU id and CUDA GPU id to -// minimize programming errors and improve code readability. Except for the +// We create strong-typed integer classes for both TF GPU id and platform GPU id +// to minimize programming errors and improve code readability. Except for the // StreamExecutor interface (as we don't change its API), whenever we need a -// TF GPU id (or CUDA GPU id) we should use TfGpuId (or CudaGpuId) instead of a -// raw integer. +// TF GPU id (or platform GPU id) we should use TfGpuId (or PlatformGpuId) +// instead of a raw integer. TF_LIB_GTL_DEFINE_INT_TYPE(TfGpuId, int32); -TF_LIB_GTL_DEFINE_INT_TYPE(CudaGpuId, int32); +TF_LIB_GTL_DEFINE_INT_TYPE(PlatformGpuId, int32); } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc b/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc index b5099dc8ef..2b40730119 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc @@ -26,26 +26,27 @@ limitations under the License. namespace tensorflow { namespace { -// Manages the map between TfGpuId and CUDA GPU id. -class TfToCudaGpuIdMap { +// Manages the map between TfGpuId and platform GPU id. 
+class TfToPlatformGpuIdMap { public: - static TfToCudaGpuIdMap* singleton() { - static auto* id_map = new TfToCudaGpuIdMap; + static TfToPlatformGpuIdMap* singleton() { + static auto* id_map = new TfToPlatformGpuIdMap; return id_map; } - Status Insert(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id) LOCKS_EXCLUDED(mu_) { + Status Insert(TfGpuId tf_gpu_id, PlatformGpuId platform_gpu_id) + LOCKS_EXCLUDED(mu_) { std::pair<IdMapType::iterator, bool> result; { mutex_lock lock(mu_); - result = id_map_.insert({tf_gpu_id.value(), cuda_gpu_id.value()}); + result = id_map_.insert({tf_gpu_id.value(), platform_gpu_id.value()}); } - if (!result.second && cuda_gpu_id.value() != result.first->second) { + if (!result.second && platform_gpu_id.value() != result.first->second) { return errors::AlreadyExists( "TensorFlow device (GPU:", tf_gpu_id.value(), ") is being mapped to " "multiple CUDA devices (", - cuda_gpu_id.value(), " now, and ", result.first->second, + platform_gpu_id.value(), " now, and ", result.first->second, " previously), which is not supported. " "This may be the result of providing different GPU configurations " "(ConfigProto.gpu_options, for example different visible_device_list)" @@ -56,17 +57,17 @@ class TfToCudaGpuIdMap { return Status::OK(); } - bool Find(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id) const + bool Find(TfGpuId tf_gpu_id, PlatformGpuId* platform_gpu_id) const LOCKS_EXCLUDED(mu_) { mutex_lock lock(mu_); auto result = id_map_.find(tf_gpu_id.value()); if (result == id_map_.end()) return false; - *cuda_gpu_id = result->second; + *platform_gpu_id = result->second; return true; } private: - TfToCudaGpuIdMap() = default; + TfToPlatformGpuIdMap() = default; void TestOnlyReset() LOCKS_EXCLUDED(mu_) { mutex_lock lock(mu_); @@ -78,17 +79,18 @@ class TfToCudaGpuIdMap { IdMapType id_map_ GUARDED_BY(mu_); friend class ::tensorflow::GpuIdManager; - TF_DISALLOW_COPY_AND_ASSIGN(TfToCudaGpuIdMap); + TF_DISALLOW_COPY_AND_ASSIGN(TfToPlatformGpuIdMap); }; } // namespace -Status GpuIdManager::InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id, - CudaGpuId cuda_gpu_id) { - return TfToCudaGpuIdMap::singleton()->Insert(tf_gpu_id, cuda_gpu_id); +Status GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId tf_gpu_id, + PlatformGpuId platform_gpu_id) { + return TfToPlatformGpuIdMap::singleton()->Insert(tf_gpu_id, platform_gpu_id); } -Status GpuIdManager::TfToCudaGpuId(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id) { - if (TfToCudaGpuIdMap::singleton()->Find(tf_gpu_id, cuda_gpu_id)) { +Status GpuIdManager::TfToPlatformGpuId(TfGpuId tf_gpu_id, + PlatformGpuId* platform_gpu_id) { + if (TfToPlatformGpuIdMap::singleton()->Find(tf_gpu_id, platform_gpu_id)) { return Status::OK(); } return errors::NotFound("TensorFlow device GPU:", tf_gpu_id.value(), @@ -96,7 +98,7 @@ Status GpuIdManager::TfToCudaGpuId(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id) { } void GpuIdManager::TestOnlyReset() { - TfToCudaGpuIdMap::singleton()->TestOnlyReset(); + TfToPlatformGpuIdMap::singleton()->TestOnlyReset(); } } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager.h b/tensorflow/core/common_runtime/gpu/gpu_id_manager.h index 491d92ccdd..62df4310c4 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_id_manager.h +++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager.h @@ -21,15 +21,17 @@ limitations under the License. namespace tensorflow { -// Class that maintains a map from TfGpuId to CudaGpuId, and manages the +// Class that maintains a map from TfGpuId to PlatformGpuId, and manages the // translation between them. 
class GpuIdManager { public: - // Adds a mapping from tf_gpu_id to cuda_gpu_id. - static Status InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id); + // Adds a mapping from tf_gpu_id to platform_gpu_id. + static Status InsertTfPlatformGpuIdPair(TfGpuId tf_gpu_id, + PlatformGpuId platform_gpu_id); - // Gets the cuda_gpu_id associated with tf_gpu_id. Returns OK if found. - static Status TfToCudaGpuId(TfGpuId tf_gpu_id, CudaGpuId* cuda_gpu_id); + // Gets the platform_gpu_id associated with tf_gpu_id. Returns OK if found. + static Status TfToPlatformGpuId(TfGpuId tf_gpu_id, + PlatformGpuId* platform_gpu_id); // Clears the map. Used in unit tests only. static void TestOnlyReset(); diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc b/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc index a663ec7051..8bf3c6a308 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc @@ -22,38 +22,38 @@ limitations under the License. namespace tensorflow { namespace { -CudaGpuId TfToCudaGpuId(TfGpuId tf) { - CudaGpuId cuda; - TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf, &cuda)); - return cuda; +PlatformGpuId TfToPlatformGpuId(TfGpuId tf) { + PlatformGpuId platform_gpu_id; + TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf, &platform_gpu_id)); + return platform_gpu_id; } TEST(GpuIdManagerTest, Basics) { TfGpuId key_0(0); - CudaGpuId value_0(0); - TF_ASSERT_OK(GpuIdManager::InsertTfCudaGpuIdPair(key_0, value_0)); - EXPECT_EQ(value_0, TfToCudaGpuId(key_0)); + PlatformGpuId value_0(0); + TF_ASSERT_OK(GpuIdManager::InsertTfPlatformGpuIdPair(key_0, value_0)); + EXPECT_EQ(value_0, TfToPlatformGpuId(key_0)); // Multiple calls to map the same value is ok. - TF_ASSERT_OK(GpuIdManager::InsertTfCudaGpuIdPair(key_0, value_0)); - EXPECT_EQ(value_0, TfToCudaGpuId(key_0)); + TF_ASSERT_OK(GpuIdManager::InsertTfPlatformGpuIdPair(key_0, value_0)); + EXPECT_EQ(value_0, TfToPlatformGpuId(key_0)); // Map a different TfGpuId to a different value. TfGpuId key_1(3); - CudaGpuId value_1(2); - TF_ASSERT_OK(GpuIdManager::InsertTfCudaGpuIdPair(key_1, value_1)); - EXPECT_EQ(value_1, TfToCudaGpuId(key_1)); + PlatformGpuId value_1(2); + TF_ASSERT_OK(GpuIdManager::InsertTfPlatformGpuIdPair(key_1, value_1)); + EXPECT_EQ(value_1, TfToPlatformGpuId(key_1)); // Mapping a different TfGpuId to the same value is ok. TfGpuId key_2(10); - TF_ASSERT_OK(GpuIdManager::InsertTfCudaGpuIdPair(key_2, value_1)); - EXPECT_EQ(value_1, TfToCudaGpuId(key_2)); + TF_ASSERT_OK(GpuIdManager::InsertTfPlatformGpuIdPair(key_2, value_1)); + EXPECT_EQ(value_1, TfToPlatformGpuId(key_2)); // Mapping the same TfGpuId to a different value. - ASSERT_FALSE(GpuIdManager::InsertTfCudaGpuIdPair(key_2, value_0).ok()); + ASSERT_FALSE(GpuIdManager::InsertTfPlatformGpuIdPair(key_2, value_0).ok()); // Getting a nonexistent mapping. - ASSERT_FALSE(GpuIdManager::TfToCudaGpuId(TfGpuId(100), &value_0).ok()); + ASSERT_FALSE(GpuIdManager::TfToPlatformGpuId(TfGpuId(100), &value_0).ok()); } } // namespace diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h index b9c66b3328..b1f10fb1dc 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h +++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h @@ -24,34 +24,37 @@ limitations under the License. namespace tensorflow { -// Utility methods for translation between Tensorflow GPU ids and CUDA GPU ids. 
+// Utility methods for translation between Tensorflow GPU ids and platform GPU
+// ids.
 class GpuIdUtil {
 public:
   // Convenient methods for getting the associated executor given a TfGpuId or
-  // CudaGpuId.
-  static se::port::StatusOr<se::StreamExecutor*> ExecutorForCudaGpuId(
-      se::Platform* gpu_manager, CudaGpuId cuda_gpu_id) {
-    return gpu_manager->ExecutorForDevice(cuda_gpu_id.value());
+  // PlatformGpuId.
+  static se::port::StatusOr<se::StreamExecutor*> ExecutorForPlatformGpuId(
+      se::Platform* gpu_manager, PlatformGpuId platform_gpu_id) {
+    return gpu_manager->ExecutorForDevice(platform_gpu_id.value());
   }
-  static se::port::StatusOr<se::StreamExecutor*> ExecutorForCudaGpuId(
-      CudaGpuId cuda_gpu_id) {
-    return ExecutorForCudaGpuId(GPUMachineManager(), cuda_gpu_id);
+  static se::port::StatusOr<se::StreamExecutor*> ExecutorForPlatformGpuId(
+      PlatformGpuId platform_gpu_id) {
+    return ExecutorForPlatformGpuId(GPUMachineManager(), platform_gpu_id);
   }
   static se::port::StatusOr<se::StreamExecutor*> ExecutorForTfGpuId(
       TfGpuId tf_gpu_id) {
-    CudaGpuId cuda_gpu_id;
-    TF_RETURN_IF_ERROR(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
-    return ExecutorForCudaGpuId(cuda_gpu_id);
+    PlatformGpuId platform_gpu_id;
+    TF_RETURN_IF_ERROR(
+        GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
+    return ExecutorForPlatformGpuId(platform_gpu_id);
   }

-  // Verify that the cuda_gpu_id associated with a TfGpuId is legitimate.
+  // Verify that the platform_gpu_id associated with a TfGpuId is legitimate.
   static void CheckValidTfGpuId(TfGpuId tf_gpu_id) {
-    CudaGpuId cuda_gpu_id;
-    TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
+    PlatformGpuId platform_gpu_id;
+    TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
     const int visible_device_count = GPUMachineManager()->VisibleDeviceCount();
-    CHECK_LT(cuda_gpu_id.value(), visible_device_count)
-        << "cuda_gpu_id is outside discovered device range."
-        << " TF GPU id: " << tf_gpu_id << " CUDA GPU id: " << cuda_gpu_id
+    CHECK_LT(platform_gpu_id.value(), visible_device_count)
+        << "platform_gpu_id is outside discovered device range."
+        << " TF GPU id: " << tf_gpu_id
+        << " platform GPU id: " << platform_gpu_id
         << " visible device count: " << visible_device_count;
   }
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index b18688174d..a5b46382f1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -106,22 +106,23 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
       return nullptr;
     }

-    CudaGpuId cuda_gpu_id;
-    TF_CHECK_OK(GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id));
+    PlatformGpuId platform_gpu_id;
+    TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
     gpu_allocator =
-        new GPUBFCAllocator(cuda_gpu_id, total_bytes, options,
+        new GPUBFCAllocator(platform_gpu_id, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));

     // If true, checks for memory overwrites by writing
     // distinctive patterns on both ends of allocated memory.
     if (useCudaMemoryGuardAllocator()) {
-      gpu_allocator = new GPUDebugAllocator(gpu_allocator, cuda_gpu_id);
-      gpu_allocator = new GPUNanResetAllocator(gpu_allocator, cuda_gpu_id);
+      gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_gpu_id);
+      gpu_allocator = new GPUNanResetAllocator(gpu_allocator, platform_gpu_id);
     } else if (useCudaMallocAllocator()) {
       // If true, passes all allocation requests through to cudaMalloc
       // useful for doing memory debugging with tools like cuda-memcheck
       // **WARNING** probably will not work in a multi-gpu scenario
-      gpu_allocator = new GPUcudaMallocAllocator(gpu_allocator, cuda_gpu_id);
+      gpu_allocator =
+          new GPUcudaMallocAllocator(gpu_allocator, platform_gpu_id);
     }

     gpu_allocators_[tf_gpu_id.value()] = gpu_allocator;
@@ -138,7 +139,7 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
     if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
       ProcessState::MemDesc md;
       md.loc = ProcessState::MemDesc::GPU;
-      md.dev_index = cuda_gpu_id.value();
+      md.dev_index = platform_gpu_id.value();
       md.gpu_registered = false;
       md.nic_registered = true;
       if (static_cast<int64>(gpu_al_.size()) <= tf_gpu_id.value()) {
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index b97603c890..e4f6bf7c86 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -93,13 +93,13 @@ Status SingleMachine::Provision() {
             strings::StrCat("Not able to parse GPU device name: ", dev.name()));
       }
       TfGpuId tf_gpu_id(parsed.id);
-      CudaGpuId cuda_gpu_id;
-      Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
+      PlatformGpuId platform_gpu_id;
+      Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
       if (!s.ok()) {
         return errors::Unavailable("Unknown TF GPU device with id ",
                                    tf_gpu_id.value(), ": ", s.ToString());
       }
-      attr = GetLocalGPUInfo(cuda_gpu_id);
+      attr = GetLocalGPUInfo(platform_gpu_id);
     } else if (dev.device_type().find("XLA") == string::npos) {
       // Filter out the fake XLA devices to avoid double counting the actual
       // hardware resources that are available.
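The hunks above all follow the same pattern: resolve a TfGpuId (the ordinal in "/device:GPU:<id>") to a PlatformGpuId through GpuIdManager before handing the id to any CUDA or allocator API. A minimal sketch of that call pattern, assuming a hypothetical helper ResolvePlatformGpuId that is not part of this patch:

// Sketch only: resolve the platform device id behind "/device:GPU:<tf_id>"
// using the renamed GpuIdManager API, mirroring the call sites patched above.
// ResolvePlatformGpuId is an illustrative name, not code from this change.
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"

namespace tensorflow {

Status ResolvePlatformGpuId(int tf_id, int* platform_id) {
  TfGpuId tf_gpu_id(tf_id);
  PlatformGpuId platform_gpu_id;
  // Returns NotFound if no TF-to-platform mapping has been inserted yet.
  TF_RETURN_IF_ERROR(
      GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
  *platform_id = platform_gpu_id.value();
  return Status::OK();
}

}  // namespace tensorflow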
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index a7519725a5..567e7c075e 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -70,13 +70,14 @@ DeviceProperties GetLocalCPUInfo() {
   return device;
 }

-DeviceProperties GetLocalGPUInfo(CudaGpuId cuda_gpu_id) {
+DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
   DeviceProperties device;
   device.set_type("GPU");

 #if GOOGLE_CUDA
   cudaDeviceProp properties;
-  cudaError_t error = cudaGetDeviceProperties(&properties, cuda_gpu_id.value());
+  cudaError_t error =
+      cudaGetDeviceProperties(&properties, platform_gpu_id.value());
   if (error != cudaSuccess) {
     device.set_type("UNKNOWN");
     LOG(ERROR) << "Failed to get device properties, error code: " << error;
@@ -122,15 +123,15 @@ DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device) {
   } else if (device.type == "GPU") {
     if (device.has_id) {
       TfGpuId tf_gpu_id(device.id);
-      CudaGpuId cuda_gpu_id;
-      Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
+      PlatformGpuId platform_gpu_id;
+      Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
       if (!s.ok()) {
         LOG(ERROR) << s;
         return unknown;
       }
-      return GetLocalGPUInfo(cuda_gpu_id);
+      return GetLocalGPUInfo(platform_gpu_id);
     } else {
-      return GetLocalGPUInfo(CudaGpuId(0));
+      return GetLocalGPUInfo(PlatformGpuId(0));
     }
   }
   return unknown;
diff --git a/tensorflow/core/grappler/clusters/utils.h b/tensorflow/core/grappler/clusters/utils.h
index ca15c48006..f0a342b728 100644
--- a/tensorflow/core/grappler/clusters/utils.h
+++ b/tensorflow/core/grappler/clusters/utils.h
@@ -28,7 +28,7 @@ DeviceProperties GetLocalCPUInfo();

 // Returns the DeviceProperties for the specified GPU attached to the server on
 // which grappler is running.
-DeviceProperties GetLocalGPUInfo(CudaGpuId cuda_gpu_id);
+DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id);

 // Returns the DeviceProperties of the specified device
 DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
diff --git a/tensorflow/core/grappler/clusters/utils_test.cc b/tensorflow/core/grappler/clusters/utils_test.cc
index 74218adbac..3863d62980 100644
--- a/tensorflow/core/grappler/clusters/utils_test.cc
+++ b/tensorflow/core/grappler/clusters/utils_test.cc
@@ -31,22 +31,22 @@ TEST(UtilsTest, GetLocalGPUInfo) {
   LOG(INFO) << "CUDA is enabled.";
   DeviceProperties properties;

-  // Invalid CUDA GPU ID.
-  properties = GetLocalGPUInfo(CudaGpuId(100));
+  // Invalid platform GPU ID.
+  properties = GetLocalGPUInfo(PlatformGpuId(100));
   EXPECT_EQ("UNKNOWN", properties.type());

-  // Succeed when a valid CUDA GPU id was inserted.
-  properties = GetLocalGPUInfo(CudaGpuId(0));
+  // Succeed when a valid platform GPU id was inserted.
+  properties = GetLocalGPUInfo(PlatformGpuId(0));
   EXPECT_EQ("GPU", properties.type());
   EXPECT_EQ("NVIDIA", properties.vendor());
 #else
   LOG(INFO) << "CUDA is not enabled.";
   DeviceProperties properties;

-  properties = GetLocalGPUInfo(CudaGpuId(0));
+  properties = GetLocalGPUInfo(PlatformGpuId(0));
   EXPECT_EQ("GPU", properties.type());

-  properties = GetLocalGPUInfo(CudaGpuId(100));
+  properties = GetLocalGPUInfo(PlatformGpuId(100));
   EXPECT_EQ("GPU", properties.type());
 #endif
 }
@@ -74,20 +74,20 @@ TEST(UtilsTest, GetDeviceInfo) {
   EXPECT_EQ("NVIDIA", properties.vendor());
 #endif

-  // TF to CUDA GPU id mapping entry doesn't exist.
+  // TF to platform GPU id mapping entry doesn't exist.
   device.has_id = true;
   device.id = 0;
   properties = GetDeviceInfo(device);
   EXPECT_EQ("UNKNOWN", properties.type());

 #if GOOGLE_CUDA
-  // Invalid CUDA GPU id.
-  GpuIdManager::InsertTfCudaGpuIdPair(TfGpuId(0), CudaGpuId(100));
+  // Invalid platform GPU id.
+  GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(0), PlatformGpuId(100));
   properties = GetDeviceInfo(device);
   EXPECT_EQ("UNKNOWN", properties.type());

-  // Valid CUDA GPU id.
-  GpuIdManager::InsertTfCudaGpuIdPair(TfGpuId(1), CudaGpuId(0));
+  // Valid platform GPU id.
+  GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(1), PlatformGpuId(0));
   device.id = 1;
   properties = GetDeviceInfo(device);
   EXPECT_EQ("GPU", properties.type());
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index aad00ce039..7691f25327 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -209,13 +209,13 @@ DeviceProperties GetDeviceInfo(const string& device_str) {
   if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
     if (parsed.type == "GPU") {
       TfGpuId tf_gpu_id(parsed.id);
-      CudaGpuId cuda_gpu_id;
-      Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
+      PlatformGpuId platform_gpu_id;
+      Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
       if (!s.ok()) {
         // We are probably running simulation without linking cuda libraries.
-        cuda_gpu_id = CudaGpuId(parsed.id);
+        platform_gpu_id = PlatformGpuId(parsed.id);
       }
-      return GetLocalGPUInfo(cuda_gpu_id);
+      return GetLocalGPUInfo(platform_gpu_id);
     } else if (parsed.type == "CPU") {
       return GetLocalCPUInfo();
     }
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index da3a99565e..c68504a272 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -68,7 +68,7 @@ message GPUOptions {
   // after the process starts. Users are required to use vendor
   // specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
   // physical to visible device mapping prior to invoking TensorFlow.
-  // 2. In the code, the ids in this list are also called "CUDA GPU id"s,
+  // 2. In the code, the ids in this list are also called "platform GPU id"s,
   //    and the 'virtual' ids of GPU devices (i.e. the ids in the device
   //    name "/device:GPU:<id>") are also called "TF GPU id"s. Please
   //    refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
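The config.proto comment above is the user-facing statement of the id scheme this rename clarifies. One concrete reading of it, as an illustration only and assuming no further virtual-device splitting: with ConfigProto.gpu_options.visible_device_list = "2,0", "/device:GPU:0" (TF GPU id 0) is backed by platform GPU id 2 and "/device:GPU:1" (TF GPU id 1) by platform GPU id 0, which is the pairwise mapping GpuIdManager records. A hypothetical snippet spelling that out with the renamed API:

// Illustration only (hypothetical helper, not part of this patch): the
// TF-to-platform mapping that visible_device_list = "2,0" is expected to
// produce, recorded through the renamed GpuIdManager API.
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/lib/core/status.h"

namespace tensorflow {

void RecordVisibleDeviceListMapping() {
  // "/device:GPU:0" -> platform (CUDA) device 2
  TF_CHECK_OK(
      GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(0), PlatformGpuId(2)));
  // "/device:GPU:1" -> platform (CUDA) device 0
  TF_CHECK_OK(
      GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(1), PlatformGpuId(0)));
}

}  // namespace tensorflow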