diff options
author | 2018-07-18 16:04:12 -0700 | |
---|---|---|
committer | 2018-07-18 16:04:12 -0700 | |
commit | 804b14e822f06ade2b52925f42924b1ad5e60790 (patch) | |
tree | 7eee939833f7f9d81365c9df5f852d7b3835e351 /tensorflow/contrib/tensorrt | |
parent | 0e6bb6e3358a741bd995cb9b0055091c6b42a632 (diff) | |
parent | 5a78e98e877bdca794ffd9e5c4f00da5d2e7ee7d (diff) |
Merge branch 'master' of https://github.com/tensorflow/tensorflow into fix_plugin_test
Diffstat (limited to 'tensorflow/contrib/tensorrt')
8 files changed, 53 insertions, 23 deletions
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index 089b03dcb5..68c78e8301 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -831,9 +831,7 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { // The allocator is used to build the engine. The build and the built engine // will be destroyed after we get the serialized engine string, so it's fine // to use unique_ptr here. - // TODO(aaroey): nvinfer1::IGpuAllocator doesn't have a virtual destructor - // and destructing the unique_ptr will result in segfault, fix it. - std::unique_ptr<TRTDeviceAllocator> alloc; + std::unique_ptr<TRTBaseAllocator> alloc; auto device_alloc = GetDeviceAndAllocator(params, engine); int cuda_device_id = 0; if (device_alloc.first >= 0) { diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc index 988b35f74f..2de7973750 100644 --- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc +++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc @@ -65,7 +65,7 @@ class IncPluginTRT : public OpKernel { reinterpret_cast<const cudaStream_t*>(context->op_device_context() ->stream() ->implementation() - ->CudaStreamMemberHack())); + ->GpuStreamMemberHack())); IncrementKernel(input_tensor.flat<float>().data(), inc_, output_tensor->flat<float>().data(), input_shape.num_elements(), *stream); diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index 04d072f5d9..54009179a8 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -230,7 +230,7 @@ void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx, reinterpret_cast<const cudaStream_t*>(ctx->op_device_context() ->stream() ->implementation() - ->CudaStreamMemberHack())); + ->GpuStreamMemberHack())); calib_res->calibrator_->setBatch(input_data, *stream); VLOG(2) << "Passed calibration data"; ExecuteNativeSegment(ctx, helper); @@ -391,7 +391,7 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, reinterpret_cast<const cudaStream_t*>(ctx->op_device_context() ->stream() ->implementation() - ->CudaStreamMemberHack())); + ->GpuStreamMemberHack())); // TODO(jie): trt enqueue does not return error auto& trt_execution_context_ptr = engine_ctx_pair.second; diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h index 6fe318be6a..9265250605 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h @@ -81,7 +81,7 @@ class TRTEngineOp : public AsyncOpKernel { std::vector<string> output_nodes_; // keep device allocator for TRT. - std::unique_ptr<TRTDeviceAllocator> allocator_; + std::unique_ptr<TRTBaseAllocator> allocator_; // serialized protobuf segment or trt engine depending on static_engine_ flag. string serialized_segment_; diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc index 9f115990c3..81d7330b49 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc +++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc @@ -37,8 +37,22 @@ void TRTCudaAllocator::free(void* memory) { cudaFree(memory); } void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment, uint32_t flags) { + // WAR for allocator alignment requirement. Certain cuda API calls require GPU + // memory with alignemtn to cudaDeviceProp::textureAlignment. + // See issue #20856 + alignment = 512; assert((alignment & (alignment - 1)) == 0); // zero or a power of 2. - void* mem = allocator_->AllocateRaw(alignment, size); + size_t total_size = size + alignment; + void* mem = allocator_->AllocateRaw(alignment, total_size); + if (!mem) { + return nullptr; + } + + void* alloc_mem = mem; + CHECK(std::align(alignment, size, mem, total_size)); + if (mem != alloc_mem) { + CHECK(mem_map_.insert({mem, alloc_mem}).second); + } VLOG(2) << "Allocated " << size << " bytes with alignment " << alignment << " @ " << mem; return mem; @@ -51,7 +65,15 @@ TRTDeviceAllocator::TRTDeviceAllocator(tensorflow::Allocator* allocator) void TRTDeviceAllocator::free(void* memory) { VLOG(2) << "Deallocating @ " << memory; - allocator_->DeallocateRaw(memory); + // allocated memory adjusted for alignment, restore the original pointer + if (memory) { + auto alloc_mem = mem_map_.find(memory); + if (alloc_mem != mem_map_.end()) { + memory = alloc_mem->second; + mem_map_.erase(alloc_mem->first); + } + allocator_->DeallocateRaw(memory); + } } } // namespace tensorrt diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/contrib/tensorrt/resources/trt_allocator.h index 97ac82ca5d..b8825b108d 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_allocator.h +++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.h @@ -37,7 +37,14 @@ class IGpuAllocator { namespace tensorflow { namespace tensorrt { -class TRTCudaAllocator : public nvinfer1::IGpuAllocator { +class TRTBaseAllocator : public nvinfer1::IGpuAllocator { + // Base allocator class so we can have a virtual destructor; + public: + // python wrapper seems to be not happy with an pure virtual destructor; + virtual ~TRTBaseAllocator() = default; +}; + +class TRTCudaAllocator : public TRTBaseAllocator { // Allocator implementation that is using cuda allocator instead of device // allocator in case we can't get device allocator from TF. public: @@ -47,7 +54,7 @@ class TRTCudaAllocator : public nvinfer1::IGpuAllocator { void free(void* memory) override; }; -class TRTDeviceAllocator : public nvinfer1::IGpuAllocator { +class TRTDeviceAllocator : public TRTBaseAllocator { // Allocator implementation wrapping TF device allocators. public: TRTDeviceAllocator(tensorflow::Allocator* allocator); @@ -62,6 +69,9 @@ class TRTDeviceAllocator : public nvinfer1::IGpuAllocator { private: tensorflow::Allocator* allocator_; + + // supporting alignment from allocation request requires a map to free; + std::unordered_map<void*, void*> mem_map_; }; } // namespace tensorrt diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h index b7d5ffd674..d7d56cb95e 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_resources.h +++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h @@ -64,7 +64,7 @@ class TRTCalibrationResource : public tensorflow::ResourceBase { std::unique_ptr<TRTInt8Calibrator> calibrator_; TrtUniquePtrType<nvinfer1::IBuilder> builder_; TrtUniquePtrType<nvinfer1::ICudaEngine> engine_; - std::unique_ptr<nvinfer1::IGpuAllocator> allocator_; + std::unique_ptr<TRTBaseAllocator> allocator_; tensorflow::tensorrt::Logger logger_; // TODO(sami): Use threadpool threads! std::unique_ptr<std::thread> thr_; diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py index 7c3ef498c9..035b112254 100644 --- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py +++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py @@ -186,8 +186,8 @@ class TfTrtIntegrationTest(test_util.TensorFlowTestCase): # Defaults to 2 runs to verify result across multiple runs is same. for _ in range(num_runs): new_val = sess.run(out, {inp: input_data}) - self.assertEquals(TEST_GRAPHS[graph_key].expected_output_dims, - new_val.shape) + self.assertEqual(TEST_GRAPHS[graph_key].expected_output_dims, + new_val.shape) if val is not None: self.assertAllEqual(new_val, val) val = new_val @@ -220,19 +220,19 @@ class TfTrtIntegrationTest(test_util.TensorFlowTestCase): for n in gdef.node: if n.op == "TRTEngineOp": num_engines += 1 - self.assertNotEqual("", n.attr["serialized_segment"].s) - self.assertNotEqual("", n.attr["segment_funcdef_name"].s) - self.assertEquals(n.attr["precision_mode"].s, precision_mode) - self.assertEquals(n.attr["static_engine"].b, not dynamic_engine) + self.assertNotEqual(to_bytes(""), n.attr["serialized_segment"].s) + self.assertNotEqual(to_bytes(""), n.attr["segment_funcdef_name"].s) + self.assertEqual(n.attr["precision_mode"].s, to_bytes(precision_mode)) + self.assertEqual(n.attr["static_engine"].b, not dynamic_engine) if precision_mode == MODE_INT8 and is_calibrated: - self.assertNotEqual("", n.attr["calibration_data"].s) + self.assertNotEqual(to_bytes(""), n.attr["calibration_data"].s) else: - self.assertEquals("", n.attr["calibration_data"].s) + self.assertEqual(to_bytes(""), n.attr["calibration_data"].s) if precision_mode is None: - self.assertEquals(num_engines, 0) + self.assertEqual(num_engines, 0) else: - self.assertEquals(num_engines, - TEST_GRAPHS[graph_key].num_expected_engines) + self.assertEqual(num_engines, + TEST_GRAPHS[graph_key].num_expected_engines) def _RunTest(self, graph_key, use_optimizer, precision_mode, dynamic_infer_engine, dynamic_calib_engine): |