aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/contrib/tensorrt
diff options
context:
space:
mode:
authorGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-07-18 16:00:32 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-07-18 16:00:39 -0700
commit5a78e98e877bdca794ffd9e5c4f00da5d2e7ee7d (patch)
tree7931d1ef5a0d02f0a62431fc45175bbdd21c0ffc /tensorflow/contrib/tensorrt
parent06a9805c336242ceded4907da9159bf518e6623a (diff)
parentfbfc8db63ca6bbabcede9dcb8b2bd8989ebebcd9 (diff)
Merge pull request #20862 from jjsjann123:alignment_pr
PiperOrigin-RevId: 205152344
Diffstat (limited to 'tensorflow/contrib/tensorrt')
-rw-r--r--tensorflow/contrib/tensorrt/convert/convert_graph.cc4
-rw-r--r--tensorflow/contrib/tensorrt/kernels/trt_engine_op.h2
-rw-r--r--tensorflow/contrib/tensorrt/resources/trt_allocator.cc26
-rw-r--r--tensorflow/contrib/tensorrt/resources/trt_allocator.h14
-rw-r--r--tensorflow/contrib/tensorrt/resources/trt_resources.h2
5 files changed, 39 insertions, 9 deletions
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 089b03dcb5..68c78e8301 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -831,9 +831,7 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
// The allocator is used to build the engine. The build and the built engine
// will be destroyed after we get the serialized engine string, so it's fine
// to use unique_ptr here.
- // TODO(aaroey): nvinfer1::IGpuAllocator doesn't have a virtual destructor
- // and destructing the unique_ptr will result in segfault, fix it.
- std::unique_ptr<TRTDeviceAllocator> alloc;
+ std::unique_ptr<TRTBaseAllocator> alloc;
auto device_alloc = GetDeviceAndAllocator(params, engine);
int cuda_device_id = 0;
if (device_alloc.first >= 0) {
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index 6fe318be6a..9265250605 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -81,7 +81,7 @@ class TRTEngineOp : public AsyncOpKernel {
std::vector<string> output_nodes_;
// keep device allocator for TRT.
- std::unique_ptr<TRTDeviceAllocator> allocator_;
+ std::unique_ptr<TRTBaseAllocator> allocator_;
// serialized protobuf segment or trt engine depending on static_engine_ flag.
string serialized_segment_;
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
index 9f115990c3..81d7330b49 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
@@ -37,8 +37,22 @@ void TRTCudaAllocator::free(void* memory) { cudaFree(memory); }
void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
uint32_t flags) {
+ // WAR for allocator alignment requirement. Certain cuda API calls require GPU
+ // memory with alignment to cudaDeviceProp::textureAlignment.
+ // See issue #20856
+ alignment = 512;
assert((alignment & (alignment - 1)) == 0); // zero or a power of 2.
- void* mem = allocator_->AllocateRaw(alignment, size);
+ size_t total_size = size + alignment;
+ void* mem = allocator_->AllocateRaw(alignment, total_size);
+ if (!mem) {
+ return nullptr;
+ }
+
+ void* alloc_mem = mem;
+ CHECK(std::align(alignment, size, mem, total_size));
+ if (mem != alloc_mem) {
+ CHECK(mem_map_.insert({mem, alloc_mem}).second);
+ }
VLOG(2) << "Allocated " << size << " bytes with alignment " << alignment
<< " @ " << mem;
return mem;
@@ -51,7 +65,15 @@ TRTDeviceAllocator::TRTDeviceAllocator(tensorflow::Allocator* allocator)
void TRTDeviceAllocator::free(void* memory) {
VLOG(2) << "Deallocating @ " << memory;
- allocator_->DeallocateRaw(memory);
+ // If the allocation was adjusted for alignment, restore the original pointer
+ if (memory) {
+ auto alloc_mem = mem_map_.find(memory);
+ if (alloc_mem != mem_map_.end()) {
+ memory = alloc_mem->second;
+ mem_map_.erase(alloc_mem->first);
+ }
+ allocator_->DeallocateRaw(memory);
+ }
}
} // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
index 97ac82ca5d..b8825b108d 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
@@ -37,7 +37,14 @@ class IGpuAllocator {
namespace tensorflow {
namespace tensorrt {
-class TRTCudaAllocator : public nvinfer1::IGpuAllocator {
+class TRTBaseAllocator : public nvinfer1::IGpuAllocator {
+ // Base allocator class so we can have a virtual destructor;
+ public:
+ // python wrapper seems to be not happy with a pure virtual destructor;
+ virtual ~TRTBaseAllocator() = default;
+};
+
+class TRTCudaAllocator : public TRTBaseAllocator {
// Allocator implementation that is using cuda allocator instead of device
// allocator in case we can't get device allocator from TF.
public:
@@ -47,7 +54,7 @@ class TRTCudaAllocator : public nvinfer1::IGpuAllocator {
void free(void* memory) override;
};
-class TRTDeviceAllocator : public nvinfer1::IGpuAllocator {
+class TRTDeviceAllocator : public TRTBaseAllocator {
// Allocator implementation wrapping TF device allocators.
public:
TRTDeviceAllocator(tensorflow::Allocator* allocator);
@@ -62,6 +69,9 @@ class TRTDeviceAllocator : public nvinfer1::IGpuAllocator {
private:
tensorflow::Allocator* allocator_;
+
+ // supporting alignment from allocation request requires a map to free;
+ std::unordered_map<void*, void*> mem_map_;
};
} // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
index b7d5ffd674..d7d56cb95e 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h
@@ -64,7 +64,7 @@ class TRTCalibrationResource : public tensorflow::ResourceBase {
std::unique_ptr<TRTInt8Calibrator> calibrator_;
TrtUniquePtrType<nvinfer1::IBuilder> builder_;
TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
- std::unique_ptr<nvinfer1::IGpuAllocator> allocator_;
+ std::unique_ptr<TRTBaseAllocator> allocator_;
tensorflow::tensorrt::Logger logger_;
// TODO(sami): Use threadpool threads!
std::unique_ptr<std::thread> thr_;