diff options
Diffstat (limited to 'tensorflow/contrib/tensorrt/resources/trt_allocator.cc')
-rw-r--r-- | tensorflow/contrib/tensorrt/resources/trt_allocator.cc | 65 |
1 file changed, 58 insertions, 7 deletions
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc index 9f115990c3..d8f97bfbbc 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc +++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc @@ -19,12 +19,42 @@ limitations under the License. #if GOOGLE_CUDA #if GOOGLE_TENSORRT +#include "cuda/include/cuda_runtime_api.h" +#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA + +namespace tensorflow { +namespace tensorrt { + +// std::align is not supported, so this method mimics its behavior. +void* Align(size_t alignment, size_t size, void*& ptr, size_t& space) { + QCHECK_GT(alignment, 0) << "alignment must be greater than 0."; + QCHECK_EQ(0, alignment & (alignment - 1)) << "Alignment must be power of 2."; + QCHECK_GT(size, 0) << "size must be greater than 0."; + QCHECK(ptr) << "ptr must not be nullptr."; + QCHECK_GT(space, 0) << "space must be greater than 0."; + const uintptr_t ptr_val = reinterpret_cast<uintptr_t>(ptr); + QCHECK_GE(ptr_val + space, ptr_val) << "Provided space overflows."; + if (size > space) return nullptr; + const uintptr_t aligned_ptr_val = ((ptr_val + alignment - 1) & -alignment); + if (aligned_ptr_val > ptr_val + space - size) return nullptr; + ptr = reinterpret_cast<void*>(aligned_ptr_val); + const uintptr_t diff = aligned_ptr_val - ptr_val; + space -= diff; + return ptr; +} + +} // namespace tensorrt +} // namespace tensorflow + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT #if NV_TENSORRT_MAJOR > 2 -#include "cuda/include/cuda_runtime_api.h" namespace tensorflow { namespace tensorrt { + void* TRTCudaAllocator::allocate(uint64_t size, uint64_t alignment, uint32_t flags) { assert((alignment & (alignment - 1)) == 0); // zero or a power of 2. @@ -37,10 +67,23 @@ void TRTCudaAllocator::free(void* memory) { cudaFree(memory); } void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment, uint32_t flags) { + // WAR for allocator alignment requirement. 
Certain cuda API calls require GPU + // memory with alignment to cudaDeviceProp::textureAlignment. + // See issue #20856 + alignment = 512; assert((alignment & (alignment - 1)) == 0); // zero or a power of 2. - void* mem = allocator_->AllocateRaw(alignment, size); - VLOG(2) << "Allocated " << size << " bytes with alignment " << alignment - << " @ " << mem; + size_t total_size = size + alignment; + void* mem = allocator_->AllocateRaw(alignment, total_size); + if (!mem) return nullptr; + + void* alloc_mem = mem; + QCHECK(Align(alignment, size, mem, total_size)); + if (mem != alloc_mem) { + QCHECK(mem_map_.insert({mem, alloc_mem}).second); + } + VLOG(2) << "Allocated " << total_size << " bytes memory @" << alloc_mem + << "; aligned to " << size << " bytes @" << mem << " with alignment " + << alignment; return mem; } @@ -51,12 +94,20 @@ TRTDeviceAllocator::TRTDeviceAllocator(tensorflow::Allocator* allocator) void TRTDeviceAllocator::free(void* memory) { VLOG(2) << "Deallocating @ " << memory; - allocator_->DeallocateRaw(memory); + // allocated memory adjusted for alignment, restore the original pointer + if (memory) { + auto alloc_mem = mem_map_.find(memory); + if (alloc_mem != mem_map_.end()) { + memory = alloc_mem->second; + mem_map_.erase(alloc_mem->first); + } + allocator_->DeallocateRaw(memory); + } } } // namespace tensorrt } // namespace tensorflow #endif -#endif -#endif +#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA |