/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"

#include <cstddef>
#include <vector>

#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/platform/stream_executor.h"

// Size of the guard regions written before and after every allocation,
// expressed in 64-bit words and in bytes.
#define MASK_WORDS 2
#define MASK_BYTES (MASK_WORDS * sizeof(int64))

namespace tensorflow {
namespace {

// Returns a heap-allocated MASK_WORDS-long array with every word set to
// `word`.  The result is intentionally leaked: it lives for the whole
// process as the canonical header/footer pattern.
int64* NewMask(int64 word) {
  int64* m = new int64[MASK_WORDS];
  for (int i = 0; i < MASK_WORDS; ++i) {
    m[i] = word;
  }
  return m;
}

// Guard patterns written immediately before (0xab..) and after (0xcd..)
// each client-visible allocation.
int64* before_mask = NewMask(0xabababababababab);
int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);

// Copies MASK_BYTES from device memory at `ptr` back to the host and
// compares them word-by-word against `mask`.  Returns true iff the guard
// region is intact; logs each mismatching word.  LOG(FATAL)s if the
// device->host copy itself fails.
bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
  se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
  int64 tmp[MASK_WORDS];

  if (!exec->SynchronousMemcpy(&tmp, gpu_ptr, MASK_BYTES)) {
    LOG(FATAL) << "Could not copy debug mask";
  }

  bool ok = true;
  for (int i = 0; i < MASK_WORDS; ++i) {
    ok &= (mask[i] == tmp[i]);
    if (!ok) {
      LOG(ERROR) << "i=" << i
                 << " mask=" << reinterpret_cast<const void*>(mask[i])
                 << " field=" << reinterpret_cast<const void*>(tmp[i]);
    }
  }

  return ok;
}

// Writes the MASK_BYTES guard pattern `mask` into device memory at `ptr`.
// LOG(FATAL)s if the host->device copy fails.
void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
  se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
  if (!exec->SynchronousMemcpy(&gpu_ptr, mask, MASK_BYTES)) {
    LOG(FATAL) << "Could not copy debug mask";
  }
}

}  // namespace

// -----------------------------------------------------------------------------
// GPUDebugAllocator
//
// Wraps another GPU allocator and brackets every allocation with guard
// words on both sides so buffer over/under-runs are detected at
// deallocation time.
// -----------------------------------------------------------------------------
GPUDebugAllocator::GPUDebugAllocator(Allocator* allocator,
                                     PlatformGpuId platform_gpu_id)
    : base_allocator_(allocator) {
  stream_exec_ =
      GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();
}

// Takes ownership of the wrapped allocator.
GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; }

void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
  // Grow the request to leave room for a header and a footer guard region.
  num_bytes += (2 * MASK_BYTES);
  void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
  if (allocated_ptr == nullptr) return allocated_ptr;

  // Return the pointer after the header
  void* rv = static_cast<char*>(allocated_ptr) + MASK_BYTES;

  // Write the header at allocated_ptr
  InitMask(stream_exec_, allocated_ptr, before_mask);

  // Write the footer at the end.
  size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
  InitMask(stream_exec_,
           static_cast<char*>(allocated_ptr) + req_size - MASK_BYTES,
           after_mask);
  return rv;
}

void GPUDebugAllocator::DeallocateRaw(void* ptr) {
  if (ptr != nullptr) {
    // Crash (via CHECK) if either guard region has been clobbered.
    CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
    CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";

    // Backtrack to the beginning of the header.
    ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
  }
  // Deallocate the memory
  base_allocator_->DeallocateRaw(ptr);
}

bool GPUDebugAllocator::TracksAllocationSizes() { return true; }

size_t GPUDebugAllocator::RequestedSize(const void* ptr) {
  // The base allocator tracks the padded size; report the client's size.
  auto req_size = base_allocator_->RequestedSize(static_cast<const char*>(ptr) -
                                                 MASK_BYTES);
  return req_size - 2 * MASK_BYTES;
}

size_t GPUDebugAllocator::AllocatedSize(const void* ptr) {
  return base_allocator_->AllocatedSize(static_cast<const char*>(ptr) -
                                        MASK_BYTES);
}

int64 GPUDebugAllocator::AllocationId(const void* ptr) {
  return base_allocator_->AllocationId(static_cast<const char*>(ptr) -
                                       MASK_BYTES);
}

void GPUDebugAllocator::GetStats(AllocatorStats* stats) {
  base_allocator_->GetStats(stats);
}

void GPUDebugAllocator::ClearStats() { base_allocator_->ClearStats(); }

// Returns true iff the header guard preceding client pointer `ptr` is intact.
bool GPUDebugAllocator::CheckHeader(void* ptr) {
  return CheckMask(stream_exec_, static_cast<char*>(ptr) - MASK_BYTES,
                   before_mask);
}

// Returns true iff the footer guard following the client region of `ptr`
// is intact.
bool GPUDebugAllocator::CheckFooter(void* ptr) {
  char* original_ptr = static_cast<char*>(ptr) - MASK_BYTES;
  size_t req_size = base_allocator_->RequestedSize(original_ptr);
  return CheckMask(stream_exec_, original_ptr + req_size - MASK_BYTES,
                   after_mask);
}

// -----------------------------------------------------------------------------
// GPUNanResetAllocator
//
// Wraps another GPU allocator and fills each buffer with NaNs on both
// allocation and deallocation, so reads of uninitialized or freed memory
// show up as NaNs downstream.
// -----------------------------------------------------------------------------
GPUNanResetAllocator::GPUNanResetAllocator(Allocator* allocator,
                                           PlatformGpuId platform_gpu_id)
    : base_allocator_(allocator) {
  stream_exec_ =
      GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie();
}

// Takes ownership of the wrapped allocator.
GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; }

void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
  void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
  if (allocated_ptr == nullptr) return allocated_ptr;

  // Initialize the buffer to Nans
  size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
  // Round up so the trailing partial float (if any) is covered too.
  std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
                          std::nanf(""));
  se::DeviceMemory<float> nan_ptr{
      se::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};

  if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
    LOG(ERROR) << "Could not initialize to NaNs";
  }

  return allocated_ptr;
}

void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
  if (ptr != nullptr) {
    // Reset the buffer to Nans
    size_t req_size = base_allocator_->RequestedSize(ptr);
    std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
                            std::nanf(""));
    se::DeviceMemory<float> nan_ptr{
        se::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
    if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
      LOG(ERROR) << "Could not initialize to NaNs";
    }
  }

  // Deallocate the memory
  base_allocator_->DeallocateRaw(ptr);
}

size_t GPUNanResetAllocator::RequestedSize(const void* ptr) {
  return base_allocator_->RequestedSize(ptr);
}

size_t GPUNanResetAllocator::AllocatedSize(const void* ptr) {
  return base_allocator_->AllocatedSize(ptr);
}

void GPUNanResetAllocator::GetStats(AllocatorStats* stats) {
  base_allocator_->GetStats(stats);
}

void GPUNanResetAllocator::ClearStats() { base_allocator_->ClearStats(); }

}  // namespace tensorflow