Merge branch 'master' of https://github.com/tensorflow/tensorflow into fix_plugin_test

author: gracehoney <31743510+aaroey@users.noreply.github.com> 2018-07-18 16:04:12 -0700
committer: gracehoney <31743510+aaroey@users.noreply.github.com> 2018-07-18 16:04:12 -0700
commit: 804b14e822f06ade2b52925f42924b1ad5e60790 (patch)
tree: 7eee939833f7f9d81365c9df5f852d7b3835e351 /tensorflow/contrib/tensorrt
parent: 0e6bb6e3358a741bd995cb9b0055091c6b42a632 (diff)
parent: 5a78e98e877bdca794ffd9e5c4f00da5d2e7ee7d (diff)
8 files changed, 53 insertions, 23 deletions
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 089b03dcb5..68c78e8301 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -831,9 +831,7 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
     // The allocator is used to build the engine. The build and the built engine
     // will be destroyed after we get the serialized engine string, so it's fine
     // to use unique_ptr here.
-    // TODO(aaroey): nvinfer1::IGpuAllocator doesn't have a virtual destructor
-    // and destructing the unique_ptr will result in segfault, fix it.
-    std::unique_ptr<TRTDeviceAllocator> alloc;
+    std::unique_ptr<TRTBaseAllocator> alloc;
     auto device_alloc = GetDeviceAndAllocator(params, engine);
     int cuda_device_id = 0;
     if (device_alloc.first >= 0) {
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
index 988b35f74f..2de7973750 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
@@ -65,7 +65,7 @@ class IncPluginTRT : public OpKernel {
         reinterpret_cast<const cudaStream_t*>(context->op_device_context()
                                                   ->stream()
                                                   ->implementation()
-                                                  ->CudaStreamMemberHack()));
+                                                  ->GpuStreamMemberHack()));
     IncrementKernel(input_tensor.flat<float>().data(), inc_,
                     output_tensor->flat<float>().data(),
                     input_shape.num_elements(), *stream);
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 04d072f5d9..54009179a8 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -230,7 +230,7 @@ void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx,
       reinterpret_cast<const cudaStream_t*>(ctx->op_device_context()
                                                 ->stream()
                                                 ->implementation()
-                                                ->CudaStreamMemberHack()));
+                                                ->GpuStreamMemberHack()));
   calib_res->calibrator_->setBatch(input_data, *stream);
   VLOG(2) << "Passed calibration data";
   ExecuteNativeSegment(ctx, helper);
@@ -391,7 +391,7 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
       reinterpret_cast<const cudaStream_t*>(ctx->op_device_context()
                                                 ->stream()
                                                 ->implementation()
-                                                ->CudaStreamMemberHack()));
+                                                ->GpuStreamMemberHack()));
 
   // TODO(jie): trt enqueue does not return error
   auto& trt_execution_context_ptr = engine_ctx_pair.second;
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index 6fe318be6a..9265250605 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -81,7 +81,7 @@ class TRTEngineOp : public AsyncOpKernel {
   std::vector<string> output_nodes_;
 
   // keep device allocator for TRT.
-  std::unique_ptr<TRTDeviceAllocator> allocator_;
+  std::unique_ptr<TRTBaseAllocator> allocator_;
 
   // serialized protobuf segment or trt engine depending on static_engine_ flag.
   string serialized_segment_;
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
index 9f115990c3..81d7330b49 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
@@ -37,8 +37,22 @@ void TRTCudaAllocator::free(void* memory) { cudaFree(memory); }
 
 void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
                                    uint32_t flags) {
+  // Workaround for the allocator alignment requirement: certain CUDA API
+  // calls require GPU memory aligned to cudaDeviceProp::textureAlignment.
+  // See issue #20856.
+  alignment = 512;
   assert((alignment & (alignment - 1)) == 0);  // zero or a power of 2.
-  void* mem = allocator_->AllocateRaw(alignment, size);
+  size_t total_size = size + alignment;
+  void* mem = allocator_->AllocateRaw(alignment, total_size);
+  if (!mem) {
+    return nullptr;
+  }
+
+  void* alloc_mem = mem;
+  CHECK(std::align(alignment, size, mem, total_size));
+  if (mem != alloc_mem) {
+    CHECK(mem_map_.insert({mem, alloc_mem}).second);
+  }
   VLOG(2) << "Allocated " << size << " bytes with alignment " << alignment
           << " @ " << mem;
   return mem;
@@ -51,7 +65,15 @@ TRTDeviceAllocator::TRTDeviceAllocator(tensorflow::Allocator* allocator)
 
 void TRTDeviceAllocator::free(void* memory) {
   VLOG(2) << "Deallocating @ " << memory;
-  allocator_->DeallocateRaw(memory);
+  // allocated memory adjusted for alignment, restore the original pointer
+  if (memory) {
+    auto alloc_mem = mem_map_.find(memory);
+    if (alloc_mem != mem_map_.end()) {
+      memory = alloc_mem->second;
+      mem_map_.erase(alloc_mem->first);
+    }
+    allocator_->DeallocateRaw(memory);
+  }
 }
 
 }  // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
index 97ac82ca5d..b8825b108d 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
@@ -37,7 +37,14 @@ class IGpuAllocator {
 namespace tensorflow {
 namespace tensorrt {
 
-class TRTCudaAllocator : public nvinfer1::IGpuAllocator {
+class TRTBaseAllocator : public nvinfer1::IGpuAllocator {
+  // Base allocator class so we can have a virtual destructor;
+ public:
+  // The Python wrapper seems to be unhappy with a pure virtual destructor;
+  virtual ~TRTBaseAllocator() = default;
+};
+
+class TRTCudaAllocator : public TRTBaseAllocator {
   // Allocator implementation that is using cuda allocator instead of device
   // allocator in case we can't get device allocator from TF.
  public:
@@ -47,7 +54,7 @@ class TRTCudaAllocator : public nvinfer1::IGpuAllocator {
   void free(void* memory) override;
 };
 
-class TRTDeviceAllocator : public nvinfer1::IGpuAllocator {
+class TRTDeviceAllocator : public TRTBaseAllocator {
   // Allocator implementation wrapping TF device allocators.
  public:
   TRTDeviceAllocator(tensorflow::Allocator* allocator);
@@ -62,6 +69,9 @@ class TRTDeviceAllocator : public nvinfer1::IGpuAllocator {
 
  private:
   tensorflow::Allocator* allocator_;
+
+  // Maps aligned pointers handed out by allocate() to the raw pointers to free.
+  std::unordered_map<void*, void*> mem_map_;
 };
 
 }  // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
index b7d5ffd674..d7d56cb95e 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h
@@ -64,7 +64,7 @@ class TRTCalibrationResource : public tensorflow::ResourceBase {
   std::unique_ptr<TRTInt8Calibrator> calibrator_;
   TrtUniquePtrType<nvinfer1::IBuilder> builder_;
   TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
-  std::unique_ptr<nvinfer1::IGpuAllocator> allocator_;
+  std::unique_ptr<TRTBaseAllocator> allocator_;
   tensorflow::tensorrt::Logger logger_;
   // TODO(sami): Use threadpool threads!
   std::unique_ptr<std::thread> thr_;
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
index 7c3ef498c9..035b112254 100644
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
@@ -186,8 +186,8 @@ class TfTrtIntegrationTest(test_util.TensorFlowTestCase):
       # Defaults to 2 runs to verify result across multiple runs is same.
       for _ in range(num_runs):
         new_val = sess.run(out, {inp: input_data})
-        self.assertEquals(TEST_GRAPHS[graph_key].expected_output_dims,
-                          new_val.shape)
+        self.assertEqual(TEST_GRAPHS[graph_key].expected_output_dims,
+                         new_val.shape)
         if val is not None:
           self.assertAllEqual(new_val, val)
         val = new_val
@@ -220,19 +220,19 @@ class TfTrtIntegrationTest(test_util.TensorFlowTestCase):
     for n in gdef.node:
       if n.op == "TRTEngineOp":
         num_engines += 1
-        self.assertNotEqual("", n.attr["serialized_segment"].s)
-        self.assertNotEqual("", n.attr["segment_funcdef_name"].s)
-        self.assertEquals(n.attr["precision_mode"].s, precision_mode)
-        self.assertEquals(n.attr["static_engine"].b, not dynamic_engine)
+        self.assertNotEqual(to_bytes(""), n.attr["serialized_segment"].s)
+        self.assertNotEqual(to_bytes(""), n.attr["segment_funcdef_name"].s)
+        self.assertEqual(n.attr["precision_mode"].s, to_bytes(precision_mode))
+        self.assertEqual(n.attr["static_engine"].b, not dynamic_engine)
         if precision_mode == MODE_INT8 and is_calibrated:
-          self.assertNotEqual("", n.attr["calibration_data"].s)
+          self.assertNotEqual(to_bytes(""), n.attr["calibration_data"].s)
         else:
-          self.assertEquals("", n.attr["calibration_data"].s)
+          self.assertEqual(to_bytes(""), n.attr["calibration_data"].s)
     if precision_mode is None:
-      self.assertEquals(num_engines, 0)
+      self.assertEqual(num_engines, 0)
     else:
-      self.assertEquals(num_engines,
-                        TEST_GRAPHS[graph_key].num_expected_engines)
+      self.assertEqual(num_engines,
+                       TEST_GRAPHS[graph_key].num_expected_engines)
 
   def _RunTest(self, graph_key, use_optimizer, precision_mode,
                dynamic_infer_engine, dynamic_calib_engine):
author	gracehoney <31743510+aaroey@users.noreply.github.com>	2018-07-18 16:04:12 -0700
committer	gracehoney <31743510+aaroey@users.noreply.github.com>	2018-07-18 16:04:12 -0700
commit	804b14e822f06ade2b52925f42924b1ad5e60790 (patch)
tree	7eee939833f7f9d81365c9df5f852d7b3835e351 /tensorflow/contrib/tensorrt
parent	0e6bb6e3358a741bd995cb9b0055091c6b42a632 (diff)
parent	5a78e98e877bdca794ffd9e5c4f00da5d2e7ee7d (diff)