author     2018-06-15 16:21:47 -0700
committer  2018-06-15 16:21:47 -0700
commit     e1e56d8f60fcfa70d65579e4b992dac571807e76 (patch)
tree       3bea2dca86e329fd83004034b0df24feeb968ced
parent     99d2d13592a78d2eac5b90fced60a2cd562bed85 (diff)
Address review comments
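
Move the device and allocator lookup out of ConvertAfterShapes() into a
new helper, GetDeviceAndAllocator(); parse the engine's device string with
DeviceNameUtils::ParseFullName() instead of splitting it on ':'; and fall
back to the default allocator (cudaMalloc, with a warning, on device 0)
when no CUDA device can be identified, instead of failing the conversion.
Also replace a stale comment on the CHECK_EQ in
TRTEngineOp::ExecuteCalibration().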
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_graph.cc | 165
-rw-r--r--  tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc |   4
2 files changed, 87 insertions(+), 82 deletions(-)
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 37a38d3e1d..20abef6806 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -48,7 +48,9 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"  // NOLINT
+#include "tensorflow/core/util/device_name_utils.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -614,6 +616,82 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
   return tensorflow::Status::OK();
 }
 
+std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
+    ConversionParams& params, EngineInfo& engine) {
+  int cuda_device_id = -1;
+  // We need to use ProcessState here, since on the Python path there is
+  // no other way to get to the allocators.
+  auto CheckDeviceID = [](int tfid) -> int {
+    tensorflow::TfGpuId tf_gpu_id(tfid);
+    CudaGpuId cuda_gpu_id;
+    Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
+    if (s.ok()) {
+      VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device "
+              << cuda_gpu_id.value();
+      return cuda_gpu_id.value();
+    }
+    VLOG(2) << "TF GPU with id " << tfid << " does not exist " << s;
+    return -1;
+  };
+  tensorflow::Allocator* dev_allocator = nullptr;
+  auto pm = tensorflow::ProcessState::singleton();
+  if (params.cluster) {  // Get the allocator from the cluster's device set.
+    const tensorflow::Device* device = nullptr;
+    if (params.cluster->GetDeviceSet()) {
+      device = params.cluster->GetDeviceSet()->FindDeviceByName(engine.device);
+    }
+    if (device) {
+      cuda_device_id = CheckDeviceID(device->parsed_name().id);
+      if (cuda_device_id < 0) {
+        LOG(ERROR) << "Cuda device identification failed, using device 0.";
+        cuda_device_id = 0;
+      }
+      tensorflow::GPUOptions gpuoptions;
+      // This should be instantiated by now.
+      tensorflow::TfGpuId tf_gpu_id(device->parsed_name().id);
+      dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1);
+      VLOG(1) << "Got an allocator for device tf_device=" << tf_gpu_id.value()
+              << " cuda device= " << cuda_device_id << " at " << dev_allocator;
+    }
+  } else {  // Cluster not found, possibly a Python call.
+    int found_device = 0;
+    bool try_gpu_ids = true;
+    // If the device is set, try to find it. This might be a problem for the
+    // multi-host case, but TensorRT does not support multi-host setups yet.
+    if (!engine.device.empty()) {
+      tensorflow::DeviceNameUtils::ParsedName parsed_name;
+      if (tensorflow::DeviceNameUtils::ParseFullName(engine.device,
+                                                     &parsed_name)) {
+        cuda_device_id = parsed_name.has_id ? parsed_name.id : -1;
+      }
+      try_gpu_ids = !parsed_name.has_id;
+    }
+    if (try_gpu_ids) {
+      while (found_device < 100) {
+        cuda_device_id = CheckDeviceID(found_device);
+        if (cuda_device_id >= 0) {
+          break;
+        }
+        found_device++;
+      }
+    }
+    if (found_device == 100) {
+      LOG(ERROR) << "Can't find a GPU device to work with. Please "
+                    "instantiate a session to initialize devices";
+      return std::make_pair(cuda_device_id, dev_allocator);
+    }
+    LOG(WARNING)
+        << "Can't determine the device; constructing an allocator at device "
+        << found_device;
+    tensorflow::GPUOptions gpuoptions;
+    gpuoptions.set_allow_growth(
+        true);  // This will be a noop if the device is already initialized.
+    tensorflow::TfGpuId tf_gpu_id(found_device);
+    dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1);
+  }
+  return std::make_pair(cuda_device_id, dev_allocator);
+}
+
 // Entry function from optimization pass.
 tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
   // Segment the graph into subgraphs that can be converted to TensorRT
@@ -694,87 +772,14 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
             segments.at(i).first.size() / total_num_nodes_in_segments) /
         2.0;
     std::shared_ptr<nvinfer1::IGpuAllocator> alloc;
+    auto device_alloc = GetDeviceAndAllocator(params, engine);
     int cuda_device_id = 0;
-    // we need to us PM here since in python path there is no way to get
-    // to allocators
-    auto pm = tensorflow::ProcessState::singleton();
-    if (params.cluster) {  // get allocator
-      const auto device =
-          params.cluster->GetDeviceSet()->FindDeviceByName(engine.device);
-      if (device) {
-        tensorflow::TfGpuId tf_gpu_id(device->parsed_name().id);
-        CudaGpuId cuda_gpu_id;
-        Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
-        if (!s.ok()) {
-          LOG(ERROR) << "Cuda device identification failed, using device "
-                        "0. Error= "
-                     << s;
-          cuda_device_id = 0;
-        } else {
-          cuda_device_id = cuda_gpu_id.value();
-        }
-        tensorflow::GPUOptions gpuoptions;
-        // this should be instantiated by now
-        auto dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1);
-        VLOG(1) << "Got an allocator for device tf_device=" << tf_gpu_id.value()
-                << " cuda device= " << cuda_device_id << " at "
-                << dev_allocator;
-        alloc.reset(new TRTDeviceAllocator(dev_allocator));
-      }
-    } else {
-      int found_device = 0;
-      bool try_gpu_ids = true;
-      auto checkDeviceId = [](int tfid) -> int {
-        tensorflow::TfGpuId tf_gpu_id(tfid);
-        CudaGpuId cuda_gpu_id;
-        Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
-        if (s.ok()) {
-          VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device "
-                  << cuda_gpu_id.value();
-          return cuda_gpu_id.value();
-        }
-        VLOG(2) << "TF GPU with id " << tfid << " do not exist " << s;
-        return -1;
-      };
-      // if device is set, try to find the device. Might be a problem for multi
-      // host case but TensorRT do not support multi host setups yet.
-      if (!engine.device.empty()) {
-        auto res = str_util::Split(engine.device, ":");
-        if (res.size() > 0) {
-          tensorflow::StringPiece s(res.back());
-          tensorflow::str_util::RemoveWhitespaceContext(&s);
-          uint64 dev_id = 0;
-          if (str_util::ConsumeLeadingDigits(&s, &dev_id)) {
-            found_device = dev_id;
-            cuda_device_id = checkDeviceId(found_device);
-            if (cuda_device_id >= 0) try_gpu_ids = false;
-          }
-        }
-      }
-      if (try_gpu_ids) {
-        while (found_device < 100) {
-          cuda_device_id = checkDeviceId(found_device);
-          if (cuda_device_id >= 0) {
-            break;
-          }
-          found_device++;
-        }
-      }
-      if (found_device == 100) {
-        LOG(ERROR) << " Can't find a GPU device to work with. Please "
-                      "instantiate a session to initialize devices";
-        return tensorflow::errors::NotFound(
-            "Can't find a GPU device to work with");
-      }
-      LOG(WARNING)
-          << "Can't determine the device constructing an allocator at device "
-          << found_device;
-      tensorflow::GPUOptions gpuoptions;
-      gpuoptions.set_allow_growth(
-          true);  // this will be a noop if device is already initialized
-      tensorflow::TfGpuId tf_gpu_id(found_device);
-      auto dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1);
-      alloc.reset(new TRTDeviceAllocator(dev_allocator));
+    if (device_alloc.first >= 0) {
+      cuda_device_id = device_alloc.first;
+      alloc.reset(new TRTDeviceAllocator(device_alloc.second));
+    } else {
+      // Setting the allocator to nullptr reverts to cudaMalloc.
+      LOG(WARNING) << "Can't identify the cuda device. Running on device 0.";
     }
     cudaSetDevice(cuda_device_id);
     auto status = CreateTRTNode(&graph, engine_segments, i, trt_node,
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 6603b0f7c3..2dddc4541c 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -222,9 +222,9 @@ void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx,
                   StrCat("Unsupported data type encountered in input ", i)));
       return;
     }
+    // Check that the allocated buffer is sufficient for the input.
     const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx);
-    CHECK_EQ(t.TotalBytes(),
-             device_tensor->TotalBytes());  // use the tensor so TF keeps it
+    CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes());
     input_data.emplace(StrCat(kInputPHName, i), data_address);
   }
   VLOG(2) << "Filled map for sending";