diff options
Diffstat (limited to 'tensorflow/contrib/tensorrt/resources/trt_allocator.cc')
-rw-r--r-- | tensorflow/contrib/tensorrt/resources/trt_allocator.cc | 65 |
1 file changed, 58 insertions, 7 deletions
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc index 9f115990c3..d8f97bfbbc 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc +++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc @@ -19,12 +19,42 @@ limitations under the License. #if GOOGLE_CUDA #if GOOGLE_TENSORRT +#include "cuda/include/cuda_runtime_api.h" +#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA + +namespace tensorflow { +namespace tensorrt { + +// std::align is not supported, so this method mimics its behavior. +void* Align(size_t alignment, size_t size, void*& ptr, size_t& space) { + QCHECK_GT(alignment, 0) << "alignment must be greater than 0."; + QCHECK_EQ(0, alignment & (alignment - 1)) << "Alignment must be power of 2."; + QCHECK_GT(size, 0) << "size must be greater than 0."; + QCHECK(ptr) << "ptr must not be nullptr."; + QCHECK_GT(space, 0) << "space must be greater than 0."; + const uintptr_t ptr_val = reinterpret_cast<uintptr_t>(ptr); + QCHECK_GE(ptr_val + space, ptr_val) << "Provided space overflows."; + if (size > space) return nullptr; + const uintptr_t aligned_ptr_val = ((ptr_val + alignment - 1) & -alignment); + if (aligned_ptr_val > ptr_val + space - size) return nullptr; + ptr = reinterpret_cast<void*>(aligned_ptr_val); + const uintptr_t diff = aligned_ptr_val - ptr_val; + space -= diff; + return ptr; +} + +} // namespace tensorrt +} // namespace tensorflow + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT #if NV_TENSORRT_MAJOR > 2 -#include "cuda/include/cuda_runtime_api.h" namespace tensorflow { namespace tensorrt { + void* TRTCudaAllocator::allocate(uint64_t size, uint64_t alignment, uint32_t flags) { assert((alignment & (alignment - 1)) == 0); // zero or a power of 2. @@ -37,10 +67,23 @@ void TRTCudaAllocator::free(void* memory) { cudaFree(memory); } void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment, uint32_t flags) { + // WAR for allocator alignment requirement. 
Certain cuda API calls require GPU + // memory with alignment to cudaDeviceProp::textureAlignment. + // See issue #20856 + alignment = 512; assert((alignment & (alignment - 1)) == 0); // zero or a power of 2. - void* mem = allocator_->AllocateRaw(alignment, size); - VLOG(2) << "Allocated " << size << " bytes with alignment " << alignment - << " @ " << mem; + size_t total_size = size + alignment; + void* mem = allocator_->AllocateRaw(alignment, total_size); + if (!mem) return nullptr; + + void* alloc_mem = mem; + QCHECK(Align(alignment, size, mem, total_size)); + if (mem != alloc_mem) { + QCHECK(mem_map_.insert({mem, alloc_mem}).second); + } + VLOG(2) << "Allocated " << total_size << " bytes memory @" << alloc_mem + << "; aligned to " << size << " bytes @" << mem << " with alignment " + << alignment; return mem; } @@ -51,12 +94,20 @@ TRTDeviceAllocator::TRTDeviceAllocator(tensorflow::Allocator* allocator) void TRTDeviceAllocator::free(void* memory) { VLOG(2) << "Deallocating @ " << memory; - allocator_->DeallocateRaw(memory); + // allocated memory adjusted for alignment, restore the original pointer + if (memory) { + auto alloc_mem = mem_map_.find(memory); + if (alloc_mem != mem_map_.end()) { + memory = alloc_mem->second; + mem_map_.erase(alloc_mem->first); + } + allocator_->DeallocateRaw(memory); + } } } // namespace tensorrt } // namespace tensorflow #endif -#endif -#endif +#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA |