Diffstat (limited to 'tensorflow/core/common_runtime/gpu')
-rw-r--r--  tensorflow/core/common_runtime/gpu/dma_helper.h  18
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_allocator_retry.cc  49
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h  36
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc  175
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc  397
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h  156
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc  166
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc  186
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h  68
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc  207
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.cc  651
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device.h  94
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_device_factory.cc  52
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc  132
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_event_mgr.h  118
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc  152
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_init.cc  147
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_init.h  19
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc  371
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_region_allocator.h  146
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_region_allocator_test.cc  71
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_stream_util.cc  97
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_stream_util.h  30
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc  137
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_util.cc  345
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_util.h  89
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc  24
-rw-r--r--  tensorflow/core/common_runtime/gpu/pool_allocator.cc  269
-rw-r--r--  tensorflow/core/common_runtime/gpu/pool_allocator.h  202
-rw-r--r--  tensorflow/core/common_runtime/gpu/pool_allocator_test.cc  203
-rw-r--r--  tensorflow/core/common_runtime/gpu/process_state.cc  220
-rw-r--r--  tensorflow/core/common_runtime/gpu/process_state.h  140
-rw-r--r--  tensorflow/core/common_runtime/gpu/visitable_allocator.h  30
33 files changed, 5197 insertions, 0 deletions
diff --git a/tensorflow/core/common_runtime/gpu/dma_helper.h b/tensorflow/core/common_runtime/gpu/dma_helper.h
new file mode 100644
index 0000000000..7b0750f405
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/dma_helper.h
@@ -0,0 +1,18 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_
+
+#include "tensorflow/core/public/tensor.h"
+
+// For internal use only. Visibility should be limited to brain/framework.
+
+namespace tensorflow {
+class DMAHelper {
+ public:
+ static bool CanUseDMA(const Tensor* t) { return t->CanUseDMA(); }
+ static const void* base(const Tensor* t) { return t->base<const void>(); }
+ static void* base(Tensor* t) { return t->base<void>(); }
+ static TensorBuffer* buffer(Tensor* t) { return t->buf_; }
+ static const TensorBuffer* buffer(const Tensor* t) { return t->buf_; }
+};
+} // namespace tensorflow
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_
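DMAHelper gives the GPU runtime access to a Tensor's underlying buffer without making every caller a friend of Tensor. A minimal sketch of the intended call pattern follows; CopyTensorBytes is a hypothetical caller invented for illustration and is not part of this change:

    #include <cstring>

    #include "tensorflow/core/common_runtime/gpu/dma_helper.h"
    #include "tensorflow/core/platform/logging.h"
    #include "tensorflow/core/public/tensor.h"

    namespace tensorflow {

    // Hypothetical helper: copy the raw bytes of one tensor into another.
    void CopyTensorBytes(const Tensor& src, Tensor* dst, size_t num_bytes) {
      if (!DMAHelper::CanUseDMA(&src)) {
        LOG(FATAL) << "Tensor type cannot be copied via DMA";
      }
      const void* src_ptr = DMAHelper::base(&src);  // const access
      void* dst_ptr = DMAHelper::base(dst);         // mutable access
      memcpy(dst_ptr, src_ptr, num_bytes);  // stand-in for a device memcpy
    }

    }  // namespace tensorflow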
diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.cc b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.cc
new file mode 100644
index 0000000000..742459c63b
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.cc
@@ -0,0 +1,49 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
+#include "tensorflow/core/public/env.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace tensorflow {
+
+GPUAllocatorRetry::GPUAllocatorRetry() : env_(Env::Default()) {}
+
+void* GPUAllocatorRetry::AllocateRaw(
+ std::function<void*(size_t alignment, size_t num_bytes,
+ bool verbose_failure)> alloc_func,
+ int max_millis_to_wait, size_t alignment, size_t num_bytes) {
+ if (num_bytes == 0) {
+ LOG(WARNING) << "Request to allocate 0 bytes";
+ return nullptr;
+ }
+ uint64 deadline_micros = env_->NowMicros() + max_millis_to_wait * 1000;
+ void* ptr = nullptr;
+ while (ptr == nullptr) {
+ ptr = alloc_func(alignment, num_bytes, false);
+ if (ptr == nullptr) {
+ uint64 now = env_->NowMicros();
+ if (now < deadline_micros) {
+ mutex_lock l(mu_);
+ WaitForMilliseconds(&l, &memory_returned_,
+ (deadline_micros - now) / 1000);
+ } else {
+ return alloc_func(alignment, num_bytes, true);
+ }
+ }
+ }
+ return ptr;
+}
+
+void GPUAllocatorRetry::DeallocateRaw(std::function<void(void*)> dealloc_func,
+ void* ptr) {
+ if (ptr == nullptr) {
+ LOG(ERROR) << "Request to free nullptr";
+ return;
+ }
+ dealloc_func(ptr);
+ {
+ mutex_lock l(mu_);
+ memory_returned_.notify_all();
+ }
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h
new file mode 100644
index 0000000000..a3298ab222
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h
@@ -0,0 +1,36 @@
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/env.h"
+
+namespace tensorflow {
+
+// A retrying wrapper for a memory allocator.
+class GPUAllocatorRetry {
+ public:
+ GPUAllocatorRetry();
+
+ // Call 'alloc_func' to obtain memory. On first call,
+ // 'verbose_failure' will be false. If return value is nullptr,
+ // then wait up to 'max_millis_to_wait' milliseconds, retrying each
+ // time a call to DeallocateRaw() is detected, until either a good
+ // pointer is returned or the deadline is exhausted. If the
+ // deadline is exhausted, try one more time with 'verbose_failure'
+ // set to true. The value returned is either the first good pointer
+ // obtained from 'alloc_func' or nullptr.
+ void* AllocateRaw(std::function<void*(size_t alignment, size_t num_bytes,
+ bool verbose_failure)> alloc_func,
+ int max_millis_to_wait, size_t alignment, size_t bytes);
+
+ // Calls dealloc_func(ptr) and then notifies any threads blocked in
+ // AllocateRaw() that would like to retry.
+ void DeallocateRaw(std::function<void(void* ptr)> dealloc_func, void* ptr);
+
+ private:
+ Env* env_;
+ mutex mu_;
+ condition_variable memory_returned_;
+};
+} // namespace tensorflow
+#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_
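GPUAllocatorRetry allocates nothing itself; it only re-invokes the supplied callback until memory comes back or the deadline passes. Below is a minimal sketch of how an existing allocator could be routed through it; the two free functions and the 100 ms deadline are invented for illustration and are not part of this change. GPUBFCAllocator wires itself up the same way via its retry_helper_ later in this diff.

    #include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
    #include "tensorflow/core/framework/allocator.h"

    namespace tensorflow {

    // Allocate through 'wrapped', blocking for up to 100 ms if it is
    // temporarily out of memory. 'wrapped' signals exhaustion by returning
    // nullptr, which GPUAllocatorRetry interprets as "wait and try again".
    void* RetryingAllocate(GPUAllocatorRetry* retry, Allocator* wrapped,
                           size_t alignment, size_t num_bytes) {
      return retry->AllocateRaw(
          [wrapped](size_t a, size_t nb, bool verbose_failure) {
            return wrapped->AllocateRaw(a, nb);  // verbose_failure unused here
          },
          100 /* max_millis_to_wait */, alignment, num_bytes);
    }

    // Free through 'wrapped' and wake any thread blocked in RetryingAllocate().
    void RetryingDeallocate(GPUAllocatorRetry* retry, Allocator* wrapped,
                            void* ptr) {
      retry->DeallocateRaw([wrapped](void* p) { wrapped->DeallocateRaw(p); },
                           ptr);
    }

    }  // namespace tensorflow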
diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
new file mode 100644
index 0000000000..db1c58cc65
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc
@@ -0,0 +1,175 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
+
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/env.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace {
+
+class FakeAllocator {
+ public:
+ FakeAllocator(size_t cap, int millis_to_wait)
+ : memory_capacity_(cap), millis_to_wait_(millis_to_wait) {}
+
+ // Allocate just keeps track of the number of outstanding allocations,
+ // not their sizes. Assume a constant size for each.
+ void* AllocateRaw(size_t alignment, size_t num_bytes) {
+ return retry_.AllocateRaw(
+ [this](size_t a, size_t nb, bool v) {
+ mutex_lock l(mu_);
+ if (memory_capacity_ > 0) {
+ --memory_capacity_;
+ return good_ptr_;
+ } else {
+ return static_cast<void*>(nullptr);
+ }
+ },
+ millis_to_wait_, alignment, num_bytes);
+ }
+
+ void DeallocateRaw(void* ptr) {
+ retry_.DeallocateRaw(
+ [this](void* p) {
+ mutex_lock l(mu_);
+ ++memory_capacity_;
+ },
+ ptr);
+ }
+
+ private:
+ GPUAllocatorRetry retry_;
+ void* good_ptr_ = reinterpret_cast<void*>(0xdeadbeef);
+ mutex mu_;
+ size_t memory_capacity_ GUARDED_BY(mu_);
+ int millis_to_wait_;
+};
+
+class GPUAllocatorRetryTest : public ::testing::Test {
+ protected:
+ GPUAllocatorRetryTest() {}
+
+ void LaunchConsumerThreads(int num_consumers, int cap_needed) {
+ consumer_count_.resize(num_consumers, 0);
+ for (int i = 0; i < num_consumers; ++i) {
+ consumers_.push_back(Env::Default()->StartThread(
+ ThreadOptions(), "anon_thread", [this, i, cap_needed]() {
+ do {
+ void* ptr = nullptr;
+ for (int j = 0; j < cap_needed; ++j) {
+ ptr = alloc_->AllocateRaw(16, 1);
+ if (ptr == nullptr) {
+ mutex_lock l(mu_);
+ has_failed_ = true;
+ return;
+ }
+ }
+ ++consumer_count_[i];
+ for (int j = 0; j < cap_needed; ++j) {
+ alloc_->DeallocateRaw(ptr);
+ }
+ } while (!notifier_.HasBeenNotified());
+ }));
+ }
+ }
+
+ // Wait up to wait_micros microseconds for has_failed_ to equal expected,
+ // then terminate all threads.
+ void JoinConsumerThreads(bool expected, int wait_micros) {
+ while (wait_micros > 0) {
+ {
+ mutex_lock l(mu_);
+ if (has_failed_ == expected) break;
+ }
+ int interval_micros = std::min(1000, wait_micros);
+ Env::Default()->SleepForMicroseconds(interval_micros);
+ wait_micros -= interval_micros;
+ }
+ notifier_.Notify();
+ for (auto c : consumers_) {
+ // Blocks until thread terminates.
+ delete c;
+ }
+ }
+
+ std::unique_ptr<FakeAllocator> alloc_;
+ std::vector<Thread*> consumers_;
+ std::vector<int> consumer_count_;
+ Notification notifier_;
+ mutex mu_;
+ bool has_failed_ GUARDED_BY(mu_) = false;
+ int count_ GUARDED_BY(mu_) = 0;
+};
+
+// Verifies correct retrying when memory is slightly overcommitted but
+// we allow retry.
+TEST_F(GPUAllocatorRetryTest, RetrySuccess) {
+ // Support up to 2 allocations simultaneously, and wait up to 10 seconds
+ // for a chance to alloc.
+ alloc_.reset(new FakeAllocator(2, 10000));
+ // Launch 3 consumers, each of whom needs 1 unit at a time.
+ LaunchConsumerThreads(3, 1);
+ // This should be enough time for each consumer to be satisfied many times.
+ Env::Default()->SleepForMicroseconds(50000);
+ JoinConsumerThreads(false, 0);
+ for (int i = 0; i < 3; ++i) {
+ LOG(INFO) << "Consumer " << i << " is " << consumer_count_[i];
+ }
+ {
+ mutex_lock l(mu_);
+ EXPECT_FALSE(has_failed_);
+ }
+ EXPECT_GT(consumer_count_[0], 0);
+ EXPECT_GT(consumer_count_[1], 0);
+ EXPECT_GT(consumer_count_[2], 0);
+}
+
+// Verifies OutOfMemory failure when memory is slightly overcommitted
+// and retry is not allowed.
+TEST_F(GPUAllocatorRetryTest, NoRetryFail) {
+ // Support up to 2 allocations simultaneously, waits up to 0 msec for
+ // a chance to alloc.
+ alloc_.reset(new FakeAllocator(2, 0));
+ // Launch 3 consumers, each of whom needs 1 unit at a time.
+ LaunchConsumerThreads(3, 1);
+ Env::Default()->SleepForMicroseconds(50000);
+ // Will wait up to 10 seconds for the expected race condition to occur,
+ // resulting in failure.
+ JoinConsumerThreads(true, 10000000);
+ for (int i = 0; i < 3; ++i) {
+ LOG(INFO) << "Consumer " << i << " is " << consumer_count_[i];
+ }
+ {
+ mutex_lock l(mu_);
+ EXPECT_TRUE(has_failed_);
+ }
+}
+
+// Verifies OutOfMemory failure when retry is allowed but memory capacity
+// is too low even for retry.
+TEST_F(GPUAllocatorRetryTest, RetryInsufficientFail) {
+ // Support up to 2 allocations simultaneously, and wait up to 10 seconds
+ // for a chance to alloc.
+ alloc_.reset(new FakeAllocator(2, 10000));
+ // Launch 3 consumers, each of whom needs 2 units at a time. We expect
+ // deadlock where 2 consumers each hold 1 unit, and timeout trying to
+ // get the second.
+ LaunchConsumerThreads(3, 2);
+ Env::Default()->SleepForMicroseconds(50000);
+ // Will wait up to 10 seconds for the expected race condition to occur,
+ // resulting in failure.
+ JoinConsumerThreads(true, 10000000);
+ for (int i = 0; i < 3; ++i) {
+ LOG(INFO) << "Consumer " << i << " is " << consumer_count_[i];
+ }
+ {
+ mutex_lock l(mu_);
+ EXPECT_TRUE(has_failed_);
+ }
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
new file mode 100644
index 0000000000..3df833594f
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
@@ -0,0 +1,397 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
+
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/lib/core/bits.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory)
+ : device_id_(device_id) {
+ // Get a pointer to the stream_executor for this device
+ stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+
+ // Allocate the requested amount of memory.
+ gpu_memory_size_ = total_memory;
+
+ LOG(INFO) << "Allocating " << strings::HumanReadableNumBytes(gpu_memory_size_)
+ << " bytes.";
+ gpu::DeviceMemory<char> gpu_mem =
+ stream_exec_->AllocateArray<char>(gpu_memory_size_);
+
+ QCHECK(gpu_mem != nullptr)
+ << " Could not allocate GPU device memory for device " << device_id
+ << ". Tried to allocate "
+ << strings::HumanReadableNumBytes(gpu_memory_size_);
+ base_ptr_ = gpu_mem.opaque();
+ LOG(INFO) << "GPU " << device_id << " memory begins at " << base_ptr_
+ << " extends to "
+ << static_cast<void*>(
+ (static_cast<char*>(base_ptr_) + gpu_memory_size_));
+
+ // Create a bunch of bins of various good sizes.
+
+ // Covers allocations of exactly 256 bytes (the minimum size).
+ bins_.insert(std::make_pair(256, new Bin(256)));
+
+ // We then create power-of-two bins covering all request sizes from
+ // 1024 bytes up to (and including) the memory limit, gpu_memory_size_.
+ for (size_t bin_size = 1024; bin_size < gpu_memory_size_ * 2; bin_size *= 2) {
+ LOG(INFO) << "Creating bin of max chunk size "
+ << strings::HumanReadableNumBytes(bin_size);
+ bins_.insert(std::make_pair(bin_size, new Bin(bin_size)));
+ }
+
+ // Create one large chunk for the whole memory space that will
+ // be chunked later.
+ GPUBFCAllocator::Chunk* c = new GPUBFCAllocator::Chunk();
+ c->ptr = gpu_mem.opaque();
+ c->size = gpu_memory_size_;
+ c->in_use = false;
+ c->prev = nullptr;
+ c->next = nullptr;
+
+ ptr_to_chunk_map_.insert(std::make_pair(c->ptr, c));
+
+ // Insert the chunk into the right bin.
+ ReassignChunkToBin(c);
+}
+
+GPUBFCAllocator::~GPUBFCAllocator() {
+ // Return memory back.
+ if (base_ptr_) {
+ gpu::DeviceMemoryBase gpu_ptr{base_ptr_};
+ stream_exec_->Deallocate(&gpu_ptr);
+ }
+
+ gtl::STLDeleteValues(&bins_);
+}
+
+void* GPUBFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) {
+ static const int64 kMaxMillisToWait = 10000; // 10 seconds
+ return retry_helper_.AllocateRaw(
+ [this](size_t a, size_t nb, bool v) {
+ return AllocateRawInternal(a, nb, v);
+ },
+ kMaxMillisToWait, unused_alignment, num_bytes);
+}
+
+void* GPUBFCAllocator::AllocateRawInternal(size_t unused_alignment,
+ size_t num_bytes,
+ bool dump_log_on_failure) {
+ if (num_bytes == 0) {
+ LOG(ERROR) << "tried to allocate 0 bytes";
+ return nullptr;
+ }
+ // First, always allocate memory of at least 256 bytes, and always
+ // allocate multiples of 256 bytes so all memory addresses are
+ // nicely byte aligned.
+ size_t rounded_bytes = (256 * ((num_bytes + 255) / 256));
+ DCHECK_EQ(0, rounded_bytes % 256);
+
+ // The BFC allocator tries to find the best fit first.
+ //
+ // First identify the first bin that could satisfy rounded_bytes.
+ auto it = bins_.lower_bound(rounded_bytes);
+ if (it == bins_.end()) {
+ LOG(ERROR) << " Asked for " << rounded_bytes << " but largest bin was "
+ << bins_.rbegin()->first;
+ return nullptr;
+ }
+
+ mutex_lock l(lock_);
+ for (; it != bins_.end(); ++it) {
+ // Start searching from the first bin for the smallest chunk that fits
+ // rounded_bytes.
+ Bin* b = it->second;
+ for (GPUBFCAllocator::Chunk* chunk : b->chunks) {
+ if (!chunk->in_use && chunk->size > rounded_bytes) {
+ // We found an existing chunk that fits us that wasn't in use.
+ chunk->in_use = true;
+
+ // If we can break the size of the chunk into two reasonably
+ // large pieces, do so.
+ //
+ // TODO(vrv): What should be the criteria when deciding when
+ // to split?
+ if (chunk->size >= rounded_bytes * 2) {
+ SplitChunk(chunk, rounded_bytes);
+ }
+
+ // The requested size of the returned chunk is what the user
+ // has allocated.
+ chunk->requested_size = num_bytes;
+
+ VLOG(4) << "Returning: " << chunk->ptr;
+ return chunk->ptr;
+ }
+ }
+ }
+
+ // We searched all bins for an existing free chunk to use and
+ // couldn't find one. This means we must have run out of memory.
+ // Dump the memory log for analysis.
+ if (dump_log_on_failure) {
+ DumpMemoryLog(rounded_bytes);
+ LOG(WARNING) << "Ran out of memory trying to allocate "
+ << strings::HumanReadableNumBytes(num_bytes)
+ << ". See logs for memory state";
+ }
+ return nullptr;
+}
+
+void GPUBFCAllocator::SplitChunk(GPUBFCAllocator::Chunk* c, size_t num_bytes) {
+ // Create a new chunk starting num_bytes after c
+ GPUBFCAllocator::Chunk* new_chunk = new GPUBFCAllocator::Chunk();
+ new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
+ VLOG(6) << "Adding to chunk map: " << new_chunk->ptr;
+ ptr_to_chunk_map_.insert(std::make_pair(new_chunk->ptr, new_chunk));
+
+ // Set the new sizes of the chunks.
+ new_chunk->size = c->size - num_bytes;
+ c->size = num_bytes;
+
+ // The new chunk is not in use.
+ new_chunk->in_use = false;
+
+ // Maintain the pointers.
+ // c <-> c_neighbor becomes
+ // c <-> new_chunk <-> c_neighbor
+ GPUBFCAllocator::Chunk* c_neighbor = c->next;
+ new_chunk->prev = c;
+ new_chunk->next = c_neighbor;
+ c->next = new_chunk;
+ if (c_neighbor) {
+ c_neighbor->prev = new_chunk;
+ }
+
+ // Maintain the bins
+ ReassignChunkToBin(new_chunk);
+ ReassignChunkToBin(c);
+}
+
+void GPUBFCAllocator::DeallocateRaw(void* ptr) {
+ retry_helper_.DeallocateRaw([this](void* p) { DeallocateRawInternal(p); },
+ ptr);
+}
+
+void GPUBFCAllocator::DeallocateRawInternal(void* ptr) {
+ if (ptr == nullptr) {
+ LOG(ERROR) << "tried to deallocate nullptr";
+ return;
+ }
+ mutex_lock l(lock_);
+
+ // Find the chunk from the ptr.
+ auto it = ptr_to_chunk_map_.find(ptr);
+ CHECK(it != ptr_to_chunk_map_.end())
+ << "Asked to deallocate a pointer we never allocated: " << ptr;
+
+ GPUBFCAllocator::Chunk* c = it->second;
+ VLOG(6) << "Chunk at " << c->ptr << " no longer in use";
+ // Mark the chunk as no longer in use
+ c->in_use = false;
+
+ // Consider coalescing it.
+ MaybeCoalesce(c);
+}
+
+// Merges c1 and c2 when c1->next is c2 and c2->prev is c1.
+// We merge c2 into c1.
+void GPUBFCAllocator::Merge(GPUBFCAllocator::Chunk* c1,
+ GPUBFCAllocator::Chunk* c2) {
+ // We can only merge chunks that are not in use.
+ DCHECK(!c1->in_use && !c2->in_use);
+
+ // c1's prev doesn't change, still points to the same ptr, and is
+ // still not in use.
+
+ // Fix up neighbor pointers
+ //
+ // c1 <-> c2 <-> c3 should become
+ // c1 <-> c3
+ GPUBFCAllocator::Chunk* c3 = c2->next;
+ c1->next = c3;
+ CHECK(c2->prev == c1);
+ if (c3 != nullptr) {
+ c3->prev = c1;
+ }
+
+ // Set the new size
+ c1->size += c2->size;
+
+ // Delete c2 and cleanup all state
+ RemoveChunkFromBin(c2);
+}
+
+void GPUBFCAllocator::ReassignChunkToBin(GPUBFCAllocator::Chunk* c) {
+ auto it = bins_.lower_bound(c->size);
+ CHECK(it != bins_.end()) << " Tried to reassign to non-existent bin for size "
+ << c->size;
+
+ Bin* new_bin = it->second;
+
+ // If the bin has not changed, do nothing.
+ Bin* old_bin = c->bin;
+ if (old_bin != nullptr && new_bin == old_bin) {
+ return;
+ }
+
+ // The bin has changed. Add the chunk to the new bin and remove
+ // the chunk from the old bin.
+ new_bin->chunks.insert(c);
+ c->bin = new_bin;
+
+ if (old_bin == nullptr) {
+ return;
+ }
+
+ // Remove chunk from old bin
+ for (auto it = old_bin->chunks.begin(); it != old_bin->chunks.end(); ++it) {
+ if (*it == c) {
+ old_bin->chunks.erase(it);
+ return;
+ }
+ }
+ CHECK(false) << "Could not find chunk in old bin";
+}
+
+void GPUBFCAllocator::RemoveChunkFromBin(GPUBFCAllocator::Chunk* c) {
+ Bin* b = c->bin;
+ for (auto it = b->chunks.begin(); it != b->chunks.end(); ++it) {
+ Chunk* other_c = *it;
+ if (other_c->ptr == c->ptr) {
+ b->chunks.erase(it);
+ VLOG(4) << "Removing: " << c->ptr;
+ ptr_to_chunk_map_.erase(c->ptr);
+ delete c;
+ return;
+ }
+ }
+
+ CHECK(false) << "Could not find chunk in bin";
+}
+
+void GPUBFCAllocator::MaybeCoalesce(GPUBFCAllocator::Chunk* c) {
+ // This chunk is no longer in-use, consider coalescing the chunk
+ // with adjacent chunks.
+ Chunk* chunk_to_reassign = nullptr;
+
+ // If the next chunk is free, coalesce the two, if the result would
+ // fit in an existing bin.
+ if (c->next && !c->next->in_use) {
+ VLOG(8) << "Chunk at " << c->next->ptr << " merging with c " << c->ptr;
+
+ chunk_to_reassign = c;
+
+ // Deletes c->next
+ Merge(c, c->next);
+ }
+
+ // If the previous chunk is free, coalesce the two
+ if (c->prev && !c->prev->in_use) {
+ VLOG(8) << "Chunk at " << c->ptr << " merging into c->prev "
+ << c->prev->ptr;
+
+ chunk_to_reassign = c->prev;
+
+ // Deletes c
+ Merge(c->prev, c);
+ }
+
+ // Reassign the final merged chunk into the right bin.
+ if (chunk_to_reassign) {
+ ReassignChunkToBin(chunk_to_reassign);
+ }
+}
+
+void GPUBFCAllocator::AddAllocVisitor(Visitor visitor) {
+ VLOG(1) << "AddVisitor";
+ mutex_lock l(lock_);
+ region_visitors_.push_back(visitor);
+ visitor(base_ptr_, gpu_memory_size_);
+}
+
+bool GPUBFCAllocator::TracksAllocationSizes() { return true; }
+
+size_t GPUBFCAllocator::RequestedSize(void* ptr) {
+ mutex_lock l(lock_);
+ auto it = ptr_to_chunk_map_.find(ptr);
+ CHECK(it != ptr_to_chunk_map_.end())
+ << "Asked for requested size of pointer we never allocated: " << ptr;
+ GPUBFCAllocator::Chunk* c = it->second;
+ return c->requested_size;
+}
+
+size_t GPUBFCAllocator::AllocatedSize(void* ptr) {
+ mutex_lock l(lock_);
+ auto it = ptr_to_chunk_map_.find(ptr);
+ CHECK(it != ptr_to_chunk_map_.end())
+ << "Asked for allocated size of pointer we never allocated: " << ptr;
+ GPUBFCAllocator::Chunk* c = it->second;
+ return c->size;
+}
+
+void GPUBFCAllocator::DumpMemoryLog(size_t num_bytes) {
+ // For each bin: tally up the total number of chunks and bytes.
+ for (auto bit : bins_) {
+ Bin* b = bit.second;
+
+ size_t total_bytes_in_use = 0;
+ size_t total_bytes_in_bin = 0;
+ size_t total_requested_bytes_in_use = 0;
+ size_t total_requested_bytes_in_bin = 0;
+ size_t total_chunks_in_use = 0;
+ size_t total_chunks_in_bin = 0;
+ for (Chunk* c : b->chunks) {
+ total_bytes_in_bin += c->size;
+ total_requested_bytes_in_bin += c->requested_size;
+ ++total_chunks_in_bin;
+ if (c->in_use) {
+ total_bytes_in_use += c->size;
+ total_requested_bytes_in_use += c->requested_size;
+ ++total_chunks_in_use;
+ }
+ }
+
+ LOG(INFO) << "Bin (" << b->bin_size
+ << "): \tTotal Chunks: " << total_chunks_in_bin
+ << ", Chunks in use: " << total_chunks_in_use << " "
+ << strings::HumanReadableNumBytes(total_bytes_in_bin)
+ << " allocated for chunks. "
+ << strings::HumanReadableNumBytes(total_requested_bytes_in_bin)
+ << " client-requested for chunks. "
+ << strings::HumanReadableNumBytes(total_bytes_in_use)
+ << " in use in bin. "
+ << strings::HumanReadableNumBytes(total_requested_bytes_in_use)
+ << " client-requested in use in bin.";
+ }
+
+ // Find the bin that we would have liked to allocate in, so we
+ // can get some further analysis about fragmentation.
+ auto it = bins_.lower_bound(num_bytes);
+ if (it != bins_.end()) {
+ Bin* b = it->second;
+
+ LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes)
+ << " was " << strings::HumanReadableNumBytes(b->bin_size)
+ << ", Chunk State: ";
+
+ for (Chunk* c : b->chunks) {
+ LOG(INFO) << c->DebugString(true);
+ }
+ }
+}
+
+} // namespace tensorflow
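One detail worth keeping in mind when reading AllocateRawInternal() above: every request is first rounded up to a multiple of 256 bytes, so 256 is both the minimum allocation and the granularity of every chunk size. A stand-alone check of that arithmetic (illustrative only, not part of the change):

    #include <cstddef>
    #include <cstdio>

    // Same rounding rule as GPUBFCAllocator::AllocateRawInternal().
    static size_t RoundTo256(size_t num_bytes) {
      return 256 * ((num_bytes + 255) / 256);
    }

    int main() {
      printf("%zu\n", RoundTo256(1));     // 256
      printf("%zu\n", RoundTo256(256));   // 256
      printf("%zu\n", RoundTo256(257));   // 512
      printf("%zu\n", RoundTo256(1000));  // 1024
      return 0;
    }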
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
new file mode 100644
index 0000000000..3d1601e132
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -0,0 +1,156 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
+#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+
+namespace tensorflow {
+
+// A GPU memory allocator that implements a 'best-fit with coalescing'
+// algorithm. This is essentially a very simple version of Doug Lea's
+// malloc (dlmalloc).
+//
+// The goal of this allocator is to support defragmentation via
+// coalescing. One assumption we make is that the process using this
+// allocator owns pretty much all of the GPU memory, and that nearly
+// all requests to allocate GPU memory go through this interface.
+class GPUBFCAllocator : public VisitableAllocator {
+ public:
+ // 'device_id' refers to the StreamExecutor ID of the device within
+ // the process and must reference a valid ID in the process.
+ explicit GPUBFCAllocator(int device_id, size_t total_memory);
+ ~GPUBFCAllocator() override;
+
+ string Name() override { return "gpu_bfc"; }
+ void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+ void DeallocateRaw(void* ptr) override;
+
+ void AddAllocVisitor(Visitor visitor) override;
+
+ // Does nothing, because gpu memory is never freed.
+ void AddFreeVisitor(Visitor visitor) override {}
+
+ bool TracksAllocationSizes() override;
+
+ size_t RequestedSize(void* ptr) override;
+
+ size_t AllocatedSize(void* ptr) override;
+
+ private:
+ struct Bin;
+
+ void* AllocateRawInternal(size_t alignment, size_t num_bytes,
+ bool dump_log_on_failure);
+ void DeallocateRawInternal(void* ptr);
+
+ // Chunks point to GPU memory. Their prev/next pointers form a
+ // doubly-linked list of addresses sorted by GPU base address that
+ // must be contiguous. Chunks contain information about whether
+ // they are in use or whether they are free, and contain a pointer
+ // to the bin they are in.
+ struct Chunk {
+ size_t size = 0; // Full size of GPU buffer.
+
+ // We sometimes give chunks that are larger than needed to reduce
+ // fragmentation. requested_size keeps track of what the client
+ // actually wanted so we can understand whether our splitting
+ // strategy is efficient.
+ size_t requested_size = 0;
+
+ bool in_use = false;
+ void* ptr = nullptr; // pointer to granted GPU subbuffer.
+
+ // If not null, the memory referred to by 'prev' is directly
+ // preceding the memory used by this chunk. E.g., It should start
+ // at 'ptr - prev->size'
+ Chunk* prev = nullptr;
+
+ // If not null, the memory referred to by 'next' is directly
+ // following the memory used by this chunk, i.e., it should be at
+ // 'ptr + size'
+ Chunk* next = nullptr;
+
+ // What bin are we in?
+ Bin* bin = nullptr;
+
+ string DebugString(bool recurse) {
+ string dbg;
+ strings::StrAppend(&dbg, " Size: ", strings::HumanReadableNumBytes(size),
+ " | Requested Size: ",
+ strings::HumanReadableNumBytes(requested_size),
+ " | in_use: ", in_use);
+ if (recurse && prev) {
+ strings::StrAppend(&dbg, ", prev: ", prev->DebugString(false));
+ }
+ if (recurse && next) {
+ strings::StrAppend(&dbg, ", next: ", next->DebugString(false));
+ }
+ return dbg;
+ }
+ };
+
+ Chunk* AllocateNewChunk(size_t num_bytes);
+ void SplitChunk(Chunk* c, size_t num_bytes);
+ void Merge(Chunk* c1, Chunk* c2);
+ void MaybeCoalesce(Chunk* c);
+
+ void ReassignChunkToBin(Chunk* c);
+ void RemoveChunkFromBin(Chunk* c);
+
+ void DumpMemoryLog(size_t num_bytes);
+
+ // A Bin is a collection of similar-sized Chunks.
+ struct Bin {
+ // All chunks in this bin have >= bin_size memory.
+ size_t bin_size = 0;
+
+ struct ChunkComparator {
+ bool operator()(Chunk* a, Chunk* b) { return a->size < b->size; }
+ };
+
+ // List of chunks within the bin, sorted by chunk size.
+ std::multiset<Chunk*, ChunkComparator> chunks;
+
+ explicit Bin(size_t bs) : bin_size(bs) {}
+
+ ~Bin() { gtl::STLDeleteElements(&chunks); }
+ };
+
+ GPUAllocatorRetry retry_helper_;
+
+ // Structures immutable after construction
+ const int device_id_;
+ // The base pointer where all the GPU memory begins.
+ void* base_ptr_ = nullptr;
+ size_t gpu_memory_size_ = 0;
+
+ // Map from bin size to Bin
+ // After construction, the bin map is never resized.
+ std::map<size_t, Bin*> bins_;
+
+ perftools::gputools::StreamExecutor* stream_exec_; // Not owned.
+
+ // Structures mutable after construction
+ mutable mutex lock_;
+ // Not owned.
+ std::unordered_map<void*, Chunk*> ptr_to_chunk_map_;
+
+ // Called once on each region, ASAP.
+ std::vector<Visitor> region_visitors_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
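Because bins_ is a std::map keyed by bin_size, "the first bin that could satisfy a request" in both AllocateRawInternal() and ReassignChunkToBin() is simply a lower_bound() lookup. The sketch below isolates that lookup using the same bin sizes the constructor creates (256, then powers of two from 1024); it is an illustration, not code from this change:

    #include <cstdio>
    #include <map>

    int main() {
      // Mirror of bins_: one bin of size 256, then power-of-two bins.
      std::map<size_t, int> bins;
      bins[256] = 0;
      for (size_t s = 1024; s <= (1u << 20); s *= 2) bins[s] = 0;

      // lower_bound() returns the first bin whose size is >= the request.
      for (size_t request : {256, 512, 4000, 300000}) {
        auto it = bins.lower_bound(request);
        printf("request %zu -> bin of size %zu\n", request, it->first);
      }
      // Prints bins 256, 1024, 4096 and 524288 respectively.
      return 0;
    }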
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
new file mode 100644
index 0000000000..7b5e8aec1d
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -0,0 +1,166 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "tensorflow/stream_executor/stream_executor.h"
+#include <gtest/gtest.h>
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+namespace {
+
+TEST(GPUBFCAllocatorTest, NoDups) {
+ GPUBFCAllocator a(0, 1 << 30);
+ // Allocate a lot of raw pointers
+ std::vector<void*> ptrs;
+ for (int s = 1; s < 1024; s++) {
+ void* raw = a.AllocateRaw(1, s);
+ ptrs.push_back(raw);
+ }
+
+ std::sort(ptrs.begin(), ptrs.end());
+
+ // Make sure none of them are equal, and that none of them overlap.
+ for (int i = 0; i < ptrs.size(); i++) {
+ if (i > 0) {
+ ASSERT_NE(ptrs[i], ptrs[i - 1]); // No dups
+ size_t req_size = a.RequestedSize(ptrs[i - 1]);
+ ASSERT_GT(req_size, 0);
+ ASSERT_GE(static_cast<char*>(ptrs[i]) - static_cast<char*>(ptrs[i - 1]),
+ req_size);
+ }
+ }
+
+ for (int i = 0; i < ptrs.size(); i++) {
+ a.DeallocateRaw(ptrs[i]);
+ }
+}
+
+TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
+ GPUBFCAllocator a(0, 1 << 30);
+ // Allocate 256 raw pointers of sizes between 100 bytes and about
+ // a meg
+ random::PhiloxRandom philox(123, 17);
+ random::SimplePhilox rand(&philox);
+
+ std::vector<void*> initial_ptrs;
+ for (int s = 1; s < 256; s++) {
+ size_t size = std::min<size_t>(
+ std::max<size_t>(rand.Rand32() % 1048576, 100), 1048576);
+ void* raw = a.AllocateRaw(1, size);
+
+ initial_ptrs.push_back(raw);
+ }
+
+ // Deallocate half of the memory, and keep track of the others.
+ std::vector<void*> existing_ptrs;
+ for (int i = 0; i < initial_ptrs.size(); i++) {
+ if (i % 2 == 1) {
+ a.DeallocateRaw(initial_ptrs[i]);
+ } else {
+ existing_ptrs.push_back(initial_ptrs[i]);
+ }
+ }
+
+ // Allocate a lot of raw pointers
+ for (int s = 1; s < 256; s++) {
+ size_t size = std::min<size_t>(
+ std::max<size_t>(rand.Rand32() % 1048576, 100), 1048576);
+ void* raw = a.AllocateRaw(1, size);
+ existing_ptrs.push_back(raw);
+ }
+
+ std::sort(existing_ptrs.begin(), existing_ptrs.end());
+ // Make sure none of them are equal
+ for (int i = 0; i < existing_ptrs.size(); i++) {
+ if (i > 0) {
+ CHECK_NE(existing_ptrs[i], existing_ptrs[i - 1]); // No dups
+
+ size_t req_size = a.RequestedSize(existing_ptrs[i - 1]);
+ ASSERT_GT(req_size, 0);
+
+ // Check that they don't overlap.
+ ASSERT_GE(static_cast<char*>(existing_ptrs[i]) -
+ static_cast<char*>(existing_ptrs[i - 1]),
+ req_size);
+ }
+ }
+
+ for (int i = 0; i < existing_ptrs.size(); i++) {
+ a.DeallocateRaw(existing_ptrs[i]);
+ }
+}
+
+TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
+ GPUBFCAllocator a(0, 1 << 30);
+
+ float* first_ptr = a.Allocate<float>(1024);
+ a.Deallocate(first_ptr);
+ for (int i = 0; i < 1024; ++i) {
+ // Allocate several buffers of different sizes, and then clean them
+ // all up. We should be able to repeat this endlessly without
+ // causing fragmentation and growth.
+ float* t1 = a.Allocate<float>(1024);
+
+ int64* t2 = a.Allocate<int64>(1048576);
+ double* t3 = a.Allocate<double>(2048);
+ float* t4 = a.Allocate<float>(10485760);
+
+ a.Deallocate(t1);
+ a.Deallocate(t2);
+ a.Deallocate(t3);
+ a.Deallocate(t4);
+ }
+
+ // At the end, we should have coalesced all memory into one region
+ // starting at the beginning, so validate that allocating a pointer
+ // starts from this region.
+ float* first_ptr_after = a.Allocate<float>(1024);
+ EXPECT_EQ(first_ptr, first_ptr_after);
+ a.Deallocate(first_ptr_after);
+}
+
+TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
+ GPUBFCAllocator a(0, 1 << 30);
+ float* ptr = a.Allocate<float>(0);
+ EXPECT_EQ(nullptr, ptr);
+}
+
+TEST(GPUBFCAllocatorTest, TracksSizes) {
+ GPUBFCAllocator a(0, 1 << 30);
+ EXPECT_EQ(true, a.TracksAllocationSizes());
+}
+
+TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
+ GPUBFCAllocator a(0, 1 << 30);
+ float* t1 = a.Allocate<float>(1);
+ EXPECT_EQ(4, a.RequestedSize(t1));
+ EXPECT_EQ(256, a.AllocatedSize(t1));
+ a.Deallocate(t1);
+}
+
+TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
+ // Configure a 1 MiB memory limit.
+ GPUBFCAllocator a(0, 1 << 20);
+
+ float* first_ptr = a.Allocate<float>(1 << 6);
+ float* second_ptr = a.Allocate<float>(1 << 20);
+
+ EXPECT_NE(nullptr, first_ptr);
+ EXPECT_EQ(nullptr, second_ptr);
+ a.Deallocate(first_ptr);
+}
+
+} // namespace
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
new file mode 100644
index 0000000000..5ec405cd80
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -0,0 +1,186 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
+
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+#define MASK_WORDS 2
+#define MASK_BYTES (MASK_WORDS * sizeof(int64))
+
+namespace {
+
+static int64* NewMask(int64 word) {
+ int64* m = new int64[MASK_WORDS];
+ for (int i = 0; i < MASK_WORDS; ++i) {
+ m[i] = word;
+ }
+ return m;
+}
+
+static int64* before_mask = NewMask(0xabababababababab);
+static int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);
+
+bool CheckMask(perftools::gputools::StreamExecutor* exec, void* ptr,
+ int64* mask) {
+ gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}};
+ int64 tmp[MASK_WORDS];
+
+ if (!exec->SynchronousMemcpy(&tmp, gpu_ptr, MASK_BYTES)) {
+ LOG(FATAL) << "Could not copy debug mask";
+ }
+
+ bool ok = true;
+ for (int i = 0; i < MASK_WORDS; ++i) {
+ ok &= (mask[i] == tmp[i]);
+ if (!ok) {
+ LOG(ERROR) << "i=" << i
+ << " mask=" << reinterpret_cast<const void*>(mask[i])
+ << " field=" << reinterpret_cast<const void*>(tmp[i]);
+ }
+ }
+
+ return ok;
+}
+
+void InitMask(perftools::gputools::StreamExecutor* exec, void* ptr,
+ int64* mask) {
+ gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}};
+ if (!exec->SynchronousMemcpy(&gpu_ptr, mask, MASK_BYTES)) {
+ LOG(FATAL) << "Could not copy debug mask";
+ }
+}
+
+} // namespace
+
+// -----------------------------------------------------------------------------
+// GPUDebugAllocator
+// -----------------------------------------------------------------------------
+GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator,
+ int device_id)
+ : base_allocator_(allocator) {
+ stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+}
+
+GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; }
+
+void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
+ num_bytes += (2 * MASK_BYTES);
+
+ void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
+
+ // Return the pointer after the header
+ void* rv = static_cast<char*>(allocated_ptr) + MASK_BYTES;
+
+ // Write the header at allocated_ptr
+ InitMask(stream_exec_, allocated_ptr, before_mask);
+
+ // Write the footer at the end.
+ size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
+ InitMask(stream_exec_,
+ static_cast<char*>(allocated_ptr) + req_size - MASK_BYTES,
+ after_mask);
+ return rv;
+}
+void GPUDebugAllocator::DeallocateRaw(void* ptr) {
+ CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
+ CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";
+
+ // Backtrack to the beginning of the header.
+ ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
+ // Deallocate the memory
+ base_allocator_->DeallocateRaw(ptr);
+}
+
+void GPUDebugAllocator::AddAllocVisitor(Visitor visitor) {
+ return base_allocator_->AddAllocVisitor(visitor);
+}
+
+void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) {
+ return base_allocator_->AddFreeVisitor(visitor);
+}
+
+bool GPUDebugAllocator::TracksAllocationSizes() { return true; }
+
+size_t GPUDebugAllocator::RequestedSize(void* ptr) {
+ auto req_size =
+ base_allocator_->RequestedSize(static_cast<char*>(ptr) - MASK_BYTES);
+ return req_size - 2 * MASK_BYTES;
+}
+
+size_t GPUDebugAllocator::AllocatedSize(void* ptr) {
+ return base_allocator_->AllocatedSize(static_cast<char*>(ptr) - MASK_BYTES);
+}
+
+bool GPUDebugAllocator::CheckHeader(void* ptr) {
+ return CheckMask(stream_exec_, static_cast<char*>(ptr) - MASK_BYTES,
+ before_mask);
+}
+
+bool GPUDebugAllocator::CheckFooter(void* ptr) {
+ char* original_ptr = static_cast<char*>(ptr) - MASK_BYTES;
+ size_t req_size = base_allocator_->RequestedSize(original_ptr);
+ return CheckMask(stream_exec_, original_ptr + req_size - MASK_BYTES,
+ after_mask);
+}
+
+// -----------------------------------------------------------------------------
+// GPUNanResetAllocator
+// -----------------------------------------------------------------------------
+GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator,
+ int device_id)
+ : base_allocator_(allocator) {
+ stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+}
+
+GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; }
+
+void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
+ void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
+
+ // Initialize the buffer to NaNs
+ size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
+ std::vector<float> nans(req_size / sizeof(float), std::nanf(""));
+ gpu::DeviceMemory<float> nan_ptr{
+ gpu::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
+
+ if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
+ LOG(ERROR) << "Could not initialize to NaNs";
+ }
+
+ return allocated_ptr;
+}
+void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
+ // Reset the buffer to NaNs
+ size_t req_size = base_allocator_->RequestedSize(ptr);
+ std::vector<float> nans(req_size / sizeof(float), std::nanf(""));
+ gpu::DeviceMemory<float> nan_ptr{
+ gpu::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
+ if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
+ LOG(ERROR) << "Could not initialize to NaNs";
+ }
+
+ // Deallocate the memory
+ base_allocator_->DeallocateRaw(ptr);
+}
+
+void GPUNanResetAllocator::AddAllocVisitor(Visitor visitor) {
+ return base_allocator_->AddAllocVisitor(visitor);
+}
+
+void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) {
+ return base_allocator_->AddFreeVisitor(visitor);
+}
+
+size_t GPUNanResetAllocator::RequestedSize(void* ptr) {
+ return base_allocator_->RequestedSize(ptr);
+}
+
+size_t GPUNanResetAllocator::AllocatedSize(void* ptr) {
+ return base_allocator_->AllocatedSize(ptr);
+}
+
+} // namespace tensorflow
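The pointer arithmetic in GPUDebugAllocator above is easier to follow with the buffer layout spelled out: the base allocator hands back num_bytes + 2 * MASK_BYTES, the caller sees the middle region, and the two 16-byte masks sit immediately before and after it. A host-only sketch of that arithmetic (plain new[]/delete[] stands in for GPU memory; not part of the change):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const size_t kMaskBytes = 2 * sizeof(int64_t);  // MASK_BYTES == 16
      const size_t client_bytes = 100;                // what the caller asked for
      const size_t padded_bytes = client_bytes + 2 * kMaskBytes;  // seen by the base allocator

      char* base = new char[padded_bytes];   // stand-in for the GPU buffer
      char* client_ptr = base + kMaskBytes;  // pointer returned to the caller
      char* header = base;                   // before_mask (0xabab...) written here
      char* footer = base + padded_bytes - kMaskBytes;  // after_mask (0xcdcd...) written here

      printf("padded=%zu header_off=%td client_off=%td footer_off=%td\n",
             padded_bytes, header - base, client_ptr - base, footer - base);
      // Prints: padded=132 header_off=0 client_off=16 footer_off=116
      delete[] base;
      return 0;
    }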
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
new file mode 100644
index 0000000000..c9b564ffc4
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -0,0 +1,68 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace tensorflow {
+
+// An allocator that wraps a GPU allocator and adds debugging
+// functionality that verifies that users do not write outside their
+// allocated memory.
+class GPUDebugAllocator : public VisitableAllocator {
+ public:
+ explicit GPUDebugAllocator(VisitableAllocator* allocator, int device_id);
+ ~GPUDebugAllocator() override;
+ string Name() override { return "gpu_debug"; }
+ void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+ void DeallocateRaw(void* ptr) override;
+ void AddAllocVisitor(Visitor visitor) override;
+ void AddFreeVisitor(Visitor visitor) override;
+ bool TracksAllocationSizes() override;
+ size_t RequestedSize(void* ptr) override;
+ size_t AllocatedSize(void* ptr) override;
+
+ // For testing.
+ bool CheckHeader(void* ptr);
+ bool CheckFooter(void* ptr);
+
+ private:
+ VisitableAllocator* base_allocator_ = nullptr; // owned
+
+ perftools::gputools::StreamExecutor* stream_exec_; // Not owned.
+
+ TF_DISALLOW_COPY_AND_ASSIGN(GPUDebugAllocator);
+};
+
+// An allocator that wraps a GPU allocator and resets the memory on
+// allocation and free to 'NaN', helping to identify cases where the
+// user forgets to initialize the memory.
+class GPUNanResetAllocator : public VisitableAllocator {
+ public:
+ explicit GPUNanResetAllocator(VisitableAllocator* allocator, int device_id);
+ ~GPUNanResetAllocator() override;
+ string Name() override { return "gpu_nan_reset"; }
+ void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+ void DeallocateRaw(void* ptr) override;
+ void AddAllocVisitor(Visitor visitor) override;
+ void AddFreeVisitor(Visitor visitor) override;
+ size_t RequestedSize(void* ptr) override;
+ size_t AllocatedSize(void* ptr) override;
+
+ private:
+ VisitableAllocator* base_allocator_ = nullptr; // owned
+
+ perftools::gputools::StreamExecutor* stream_exec_; // Not owned.
+
+ TF_DISALLOW_COPY_AND_ASSIGN(GPUNanResetAllocator);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_
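GPUNanResetAllocator relies on the fact that a buffer full of quiet NaNs fails any std::isfinite() check, which is exactly what the tests below assert after allocation and after free. A tiny host-only sketch of that fill (illustrative only):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      // Same fill pattern that GPUNanResetAllocator copies to the device.
      std::vector<float> nans(8, std::nanf(""));
      int finite = 0;
      for (float f : nans) finite += std::isfinite(f) ? 1 : 0;
      printf("finite values: %d of %zu\n", finite, nans.size());  // 0 of 8
      return 0;
    }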
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
new file mode 100644
index 0000000000..5f63906576
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -0,0 +1,207 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include <gtest/gtest.h>
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
+ const int device_id = 0;
+ GPUDebugAllocator a(new GPUBFCAllocator(device_id, 1 << 30), device_id);
+ auto stream_exec =
+ GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+
+ for (int s : {8}) {
+ std::vector<int64> cpu_array(s);
+ memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
+ int64* gpu_array = a.Allocate<int64>(cpu_array.size());
+ gpu::DeviceMemory<int64> gpu_array_ptr{gpu::DeviceMemoryBase{gpu_array}};
+ ASSERT_TRUE(stream_exec->SynchronousMemcpy(&gpu_array_ptr, &cpu_array[0],
+ s * sizeof(int64)));
+ EXPECT_TRUE(a.CheckHeader(gpu_array));
+ EXPECT_TRUE(a.CheckFooter(gpu_array));
+
+ // Confirm no error on free.
+ a.DeallocateRaw(gpu_array);
+ }
+}
+
+TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
+ for (int s : {8, 211}) {
+ EXPECT_DEATH(
+ {
+ const int device_id = 0;
+ GPUDebugAllocator a(new GPUBFCAllocator(device_id, 1 << 30),
+ device_id);
+ auto stream_exec =
+ GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+
+ std::vector<int64> cpu_array(s);
+ memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
+ int64* gpu_array = a.Allocate<int64>(cpu_array.size());
+
+ gpu::DeviceMemory<int64> gpu_array_ptr{
+ gpu::DeviceMemoryBase{gpu_array}};
+ ASSERT_TRUE(stream_exec->SynchronousMemcpy(
+ &gpu_array_ptr, &cpu_array[0], cpu_array.size() * sizeof(int64)));
+
+ gpu::DeviceMemory<int64> gpu_hdr_ptr{
+ gpu::DeviceMemoryBase{gpu_array - 1}};
+ // Clobber first word of the header.
+ float pi = 3.1417;
+ ASSERT_TRUE(
+ stream_exec->SynchronousMemcpy(&gpu_hdr_ptr, &pi, sizeof(float)));
+
+ // Expect error on free.
+ a.Deallocate(gpu_array);
+ },
+ "");
+ }
+}
+
+TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
+ for (int s : {8, 22}) {
+ EXPECT_DEATH(
+ {
+ const int device_id = 0;
+ GPUDebugAllocator a(new GPUBFCAllocator(device_id, 1 << 30),
+ device_id);
+ auto stream_exec =
+ GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+
+ std::vector<int64> cpu_array(s);
+ memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
+ int64* gpu_array = a.Allocate<int64>(cpu_array.size());
+
+ gpu::DeviceMemory<int64> gpu_array_ptr{
+ gpu::DeviceMemoryBase{gpu_array}};
+ ASSERT_TRUE(stream_exec->SynchronousMemcpy(
+ &gpu_array_ptr, &cpu_array[0], cpu_array.size() * sizeof(int64)));
+
+ // Clobber word of the footer.
+ gpu::DeviceMemory<int64> gpu_ftr_ptr{
+ gpu::DeviceMemoryBase{gpu_array + s}};
+ float pi = 3.1417;
+ ASSERT_TRUE(
+ stream_exec->SynchronousMemcpy(&gpu_ftr_ptr, &pi, sizeof(float)));
+
+ // Expect error on free.
+ a.Deallocate(gpu_array);
+ },
+ "");
+ }
+}
+
+TEST(GPUDebugAllocatorTest, ResetToNan) {
+ const int device_id = 0;
+ GPUNanResetAllocator a(new GPUBFCAllocator(device_id, 1 << 30), device_id);
+ auto stream_exec =
+ GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+
+ std::vector<float> cpu_array(1024);
+ std::vector<float> cpu_array_result(1024);
+
+ // Allocate 1024 floats
+ float* gpu_array = a.Allocate<float>(cpu_array.size());
+ gpu::DeviceMemory<float> gpu_array_ptr{gpu::DeviceMemoryBase{gpu_array}};
+ ASSERT_TRUE(stream_exec->SynchronousMemcpy(&cpu_array[0], gpu_array_ptr,
+ cpu_array.size() * sizeof(float)));
+ for (float f : cpu_array) {
+ ASSERT_FALSE(std::isfinite(f));
+ }
+
+ // Set one of the fields to 1.0.
+ cpu_array[0] = 1.0;
+ ASSERT_TRUE(stream_exec->SynchronousMemcpy(&gpu_array_ptr, &cpu_array[0],
+ cpu_array.size() * sizeof(float)));
+ // Copy the data back and verify.
+ ASSERT_TRUE(
+ stream_exec->SynchronousMemcpy(&cpu_array_result[0], gpu_array_ptr,
+ cpu_array_result.size() * sizeof(float)));
+ ASSERT_EQ(1.0, cpu_array_result[0]);
+
+ // Free the array
+ a.Deallocate(gpu_array);
+
+ // All values should be reset to nan.
+ ASSERT_TRUE(
+ stream_exec->SynchronousMemcpy(&cpu_array_result[0], gpu_array_ptr,
+ cpu_array_result.size() * sizeof(float)));
+ for (float f : cpu_array_result) {
+ ASSERT_FALSE(std::isfinite(f));
+ }
+}
+
+TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
+ const int device_id = 0;
+ // NaN reset must be the outer-most allocator.
+ GPUNanResetAllocator a(
+ new GPUDebugAllocator(new GPUBFCAllocator(device_id, 1 << 30), device_id),
+ device_id);
+ auto stream_exec =
+ GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+
+ std::vector<float> cpu_array(1024);
+ std::vector<float> cpu_array_result(1024);
+
+ // Allocate 1024 floats
+ float* gpu_array = a.Allocate<float>(cpu_array.size());
+ gpu::DeviceMemory<float> gpu_array_ptr{gpu::DeviceMemoryBase{gpu_array}};
+ ASSERT_TRUE(stream_exec->SynchronousMemcpy(&cpu_array[0], gpu_array_ptr,
+ cpu_array.size() * sizeof(float)));
+ for (float f : cpu_array) {
+ ASSERT_FALSE(std::isfinite(f));
+ }
+
+ // Set one of the fields to 1.0.
+ cpu_array[0] = 1.0;
+ ASSERT_TRUE(stream_exec->SynchronousMemcpy(&gpu_array_ptr, &cpu_array[0],
+ cpu_array.size() * sizeof(float)));
+ // Copy the data back and verify.
+ ASSERT_TRUE(
+ stream_exec->SynchronousMemcpy(&cpu_array_result[0], gpu_array_ptr,
+ cpu_array_result.size() * sizeof(float)));
+ ASSERT_EQ(1.0, cpu_array_result[0]);
+
+ // Free the array
+ a.Deallocate(gpu_array);
+
+ // All values should be reset to nan.
+ ASSERT_TRUE(
+ stream_exec->SynchronousMemcpy(&cpu_array_result[0], gpu_array_ptr,
+ cpu_array_result.size() * sizeof(float)));
+ for (float f : cpu_array_result) {
+ ASSERT_FALSE(std::isfinite(f));
+ }
+}
+
+TEST(GPUDebugAllocatorTest, TracksSizes) {
+ GPUDebugAllocator a(new GPUBFCAllocator(0, 1 << 30), 0);
+ EXPECT_EQ(true, a.TracksAllocationSizes());
+}
+
+TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
+ GPUNanResetAllocator a(
+ new GPUDebugAllocator(new GPUBFCAllocator(0, 1 << 30), 0), 0);
+ float* t1 = a.Allocate<float>(1);
+ EXPECT_EQ(4, a.RequestedSize(t1));
+ EXPECT_EQ(256, a.AllocatedSize(t1));
+ a.Deallocate(t1);
+}
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
new file mode 100644
index 0000000000..26d34645f1
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -0,0 +1,651 @@
+// TODO(opensource): Use a more generic sounding preprocessor name than
+// GOOGLE_CUDA
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+//#include "base/commandlineflags.h"
+#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/core/common_runtime/local_device.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/types.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+#if defined(PLATFORM_GOOGLE)
+DEFINE_bool(brain_gpu_sync_every_op, false,
+ "If true, call GPUUtil::Sync() between every dispatched opkernel.");
+
+DEFINE_int32(brain_gpu_max_streams, 1,
+ "Max number of GPU streams to use for computation.");
+#else
+// TODO(opensource): These should be made options in some options struct,
+// rather than flags.
+bool FLAGS_brain_gpu_sync_every_op = false;
+tensorflow::int32 FLAGS_brain_gpu_max_streams = 1;
+#endif
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+// Eigen Ops directly allocate memory only for temporary buffers used
+// during OpKernel::Compute(). The recommended way of allocating such
+// memory is via OpKernelContext::allocate_temp(). However, Eigen Ops
+// don't have access to OpKernelContext, instead they get access to
+// memory directly through the device allocator. As an Open Source
+// project, Eigen assumes allocator semantics similar to those of the
+// CUDA memory allocator, and may not work correctly due to race
+// conditions if used with some other allocator. For safety, we need
+// to delay deallocation calls out of Eigen until all events on the
+// corresponding stream have completed. The following two classes
+// serve this purpose in two different compilation environments.
+
+#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
+class EigenAllocator : public ::Eigen::Allocator {
+ public:
+ explicit EigenAllocator(gpu::Stream* stream, ::tensorflow::Allocator* alloc,
+ EventMgr* em)
+ : stream_(stream), allocator_(alloc), em_(em) {}
+
+ void* allocate(size_t num_bytes) const override {
+ void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
+ // Eigen doesn't typically check the return pointer from allocate,
+ // so we do it here and die with a more helpful error message.
+ if (ret == nullptr) {
+ LOG(FATAL) << "EigenAllocator for GPU ran out of memory when allocating "
+ << num_bytes << ". See error logs for more detailed info.";
+ }
+ return ret;
+ }
+
+ void deallocate(void* buffer) const override {
+ em_->ThenDeleteBuffer(stream_, {allocator_, buffer});
+ }
+
+ private:
+ gpu::Stream* stream_; // Not owned.
+ ::tensorflow::Allocator* allocator_; // Not owned.
+ ::tensorflow::EventMgr* em_; // Not owned.
+
+ TF_DISALLOW_COPY_AND_ASSIGN(EigenAllocator);
+};
+
+#else
+class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
+ public:
+ EigenCudaStreamDevice(const cudaStream_t* cuda_stream, int gpu_id,
+ ::tensorflow::Allocator* alloc)
+ : stream_(cuda_stream), allocator_(alloc) {
+ Eigen::initializeDeviceProp();
+ device_prop_ = &Eigen::m_deviceProperties[gpu_id];
+ }
+
+ const cudaStream_t& stream() const override { return *stream_; }
+ const cudaDeviceProp& deviceProperties() const override {
+ return *device_prop_;
+ }
+
+ void* allocate(size_t num_bytes) const override {
+ void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
+ if (ret == nullptr) {
+ LOG(FATAL) << "EigenAllocator for GPU ran out of memory when allocating "
+ << num_bytes << ". See error logs for more detailed info.";
+ }
+
+ return ret;
+ }
+ void deallocate(void* buffer) const override {
+ AsyncFreeData* afData = new AsyncFreeData(allocator_, buffer);
+ cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0);
+ CHECK_EQ(err, cudaSuccess);
+ }
+
+ private:
+ struct AsyncFreeData {
+ AsyncFreeData(::tensorflow::Allocator* a, void* p)
+ : allocator_(a), address_(p) {}
+ ::tensorflow::Allocator* allocator_;
+ void* address_;
+ };
+
+ static void CUDART_CB asyncFree(cudaStream_t stream, cudaError_t status,
+ void* userData) {
+ AsyncFreeData* data = static_cast<AsyncFreeData*>(userData);
+ data->allocator_->DeallocateRaw(data->address_);
+ delete data;
+ }
+
+ const cudaStream_t* stream_; // Not owned.
+ const cudaDeviceProp* device_prop_; // Not owned.
+ ::tensorflow::Allocator* allocator_; // Not owned.
+
+ TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
+};
+
+#endif
+
+BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
+ Bytes memory_limit, BusAdjacency bus_adjacency,
+ int gpu_id, const string& physical_device_desc,
+ Allocator* gpu_allocator, Allocator* cpu_allocator)
+ : LocalDevice(options, Device::BuildDeviceAttributes(
+ name, DEVICE_GPU, memory_limit, bus_adjacency,
+ physical_device_desc),
+ gpu_allocator),
+ gpu_allocator_(gpu_allocator),
+ cpu_allocator_(cpu_allocator),
+ gpu_id_(gpu_id) {
+ gpu::StreamExecutor* executor =
+ GPUMachineManager()->ExecutorForDevice(gpu_id_).ValueOrDie();
+ if (!executor) {
+ LOG(ERROR) << "Failed to get StreamExecutor for device " << gpu_id_;
+ return;
+ }
+ em_.reset(new EventMgr(executor));
+
+ if (FLAGS_brain_gpu_max_streams < 1) {
+ LOG(FATAL) << "Invalid value for brain_gpu_max_streams.";
+ }
+
+ // Create the specified number of GPU streams
+ for (int i = 0; i < FLAGS_brain_gpu_max_streams; i++) {
+ auto stream = new gpu::Stream(executor);
+ stream->Init();
+ VLOG(2) << "Created stream[" << i << "] = " << stream;
+ streams_.push_back(stream);
+ device_contexts_.push_back(new GPUDeviceContext(i, stream));
+ }
+ gpu_device_info_ = new GpuDeviceInfo;
+ gpu_device_info_->stream = streams_[0];
+ gpu_device_info_->default_context = device_contexts_[0];
+ gpu_device_info_->event_mgr = em_.get();
+ set_tensorflow_gpu_device_info(gpu_device_info_);
+}
+
+BaseGPUDevice::~BaseGPUDevice() {
+ delete gpu_device_info_;
+ for (auto ctx : device_contexts_) ctx->Unref();
+ gtl::STLDeleteElements(&streams_);
+}
+
+Status BaseGPUDevice::FillContextMap(const Graph* graph,
+ DeviceContextMap* device_context_map) {
+ VLOG(2) << "FillContextMap";
+
+ const auto num_streams = streams_.size();
+ // Special case for single stream.
+ if (num_streams == 1) {
+ return Status::OK();
+ }
+ const int64 before = Env::Default()->NowMicros();
+ gpu_stream_util::AssignStreamsOpts opts;
+ opts.max_streams = num_streams;
+ std::unordered_map<int, int> node_to_stream_id;
+ TF_RETURN_IF_ERROR(
+ gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id));
+ int64 elapsed = Env::Default()->NowMicros() - before;
+ VLOG(3) << "AssignStreams took " << elapsed << "us";
+
+ // Fill in the context map. It is OK for this map to contain
+ // duplicate DeviceContexts so long as we increment the refcount.
+ for (Node* n : graph->nodes()) {
+ auto mapped_stream = node_to_stream_id[n->id()];
+ CHECK_LE(mapped_stream, num_streams);
+ auto ctx = device_contexts_[mapped_stream];
+ VLOG(3) << "Assigned stream " << node_to_stream_id[n->id()]
+ << " ==> stream[" << ctx->stream_id() << "] for node id " << n->id()
+ << " " << n->type_string() << " " << n->name();
+ ctx->Ref();
+ device_context_map->insert(std::make_pair(n->id(), ctx));
+ }
+
+ return Status::OK();
+}
+
+void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
+  // ScopedActivity is cheap when tracing is not active, but we can
+  // still avoid computing the Hash64 in that case.
+ // TODO(pbar) This would no longer be needed if Ops have a unique id.
+ const uint64 id = port::Tracing::IsActive() ? Hash64(op_kernel->name()) : 0;
+ port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute,
+ id);
+
+ GPUDeviceContext* gpu_device_context = device_contexts_[0];
+ if (context->op_device_context() != nullptr) {
+ gpu_device_context =
+ static_cast<GPUDeviceContext*>(context->op_device_context());
+ }
+ gpu::Stream* stream = gpu_device_context->stream();
+ const auto stream_id = gpu_device_context->stream_id();
+
+ VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op "
+ << op_kernel->def().op() << " on GPU" << gpu_id_ << " stream["
+ << stream_id << "]";
+
+ // NOTE(tucker): We need to discriminate between Eigen GPU
+ // operations and all others. If an operation is Eigen
+ // implemented (or otherwise tries to launch a cuda kernel
+ // directly), we need to establish a stacked-scoped environment
+ // that directs it to execute on the proper device. Otherwise we
+ // expect the Op to use StreamExecutor directly and correctly. The
+ // way we make this discrimination is quite hacky: At the moment
+ // the only non-Eigen GPU Op is the recv-op, which is known to be
+ // asynchronous.
+ if (op_kernel->type_string() == "_Recv") {
+ context->SetStatus(errors::Internal(
+ "Invalid synchronous 'Compute' on GPU for '_Recv' op"));
+ } else {
+ const string label =
+ strings::StrCat(op_kernel->name(), ":", op_kernel->type_string());
+ port::Tracing::ScopedAnnotation annotation(label);
+
+ const auto num_streams = streams_.size();
+ if (num_streams > 1) {
+      // If an input was produced on a different stream, this op's stream
+      // must wait for that stream before the kernel is launched.
+ for (int i = 0; i < context->num_inputs(); ++i) {
+ const GPUDeviceContext* idc =
+ static_cast<GPUDeviceContext*>(context->input_device_context(i));
+ OP_REQUIRES(context, idc != nullptr,
+ errors::Internal("Input device context ", i,
+ " was not set properly."));
+ if (VLOG_IS_ON(2)) {
+ const void* base;
+ size_t len;
+ if (context->has_input(i)) {
+ if (IsRefType(context->input_dtype(i))) {
+ Tensor tensor = context->mutable_input(i, false);
+ base = DMAHelper::base(&tensor);
+ len = tensor.TotalBytes();
+ } else {
+ const Tensor& tensor = context->input(i);
+ base = DMAHelper::base(&tensor);
+ len = tensor.TotalBytes();
+ }
+ VLOG(2) << "Input " << i << " " << base << " " << len;
+ VLOG(2) << " stream[" << stream_id << "].ThenWaitFor(stream["
+ << idc->stream_id() << "])"
+ << ((idc->stream() == stream) ? " not needed" : "");
+ }
+ }
+ if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
+ }
+ }
+ gpu::cuda::ScopedActivateExecutorContext scoped_activation{
+ stream->parent(), gpu::cuda::MultiOpActivation::kYes};
+ // Keep a copy of the inputs before Compute runs, in case they get
+ // deleted. TODO(misard) this will be fixed when the tracking is
+ // done right.
+ std::vector<Tensor>* tensor_refs = nullptr;
+ if (!FLAGS_brain_gpu_sync_every_op) {
+ tensor_refs = new std::vector<Tensor>;
+ tensor_refs->reserve(context->num_inputs() + context->num_outputs());
+ for (int ii = 0; ii < context->num_inputs(); ++ii) {
+ if (context->has_input(ii)) {
+ if (IsRefType(context->input_dtype(ii))) {
+ Tensor in = context->mutable_input(ii, false);
+ tensor_refs->push_back(in);
+ } else {
+ const Tensor& in = context->input(ii);
+ tensor_refs->push_back(in);
+ }
+ }
+ }
+ }
+ op_kernel->Compute(context);
+ if (context->status().ok()) {
+ if (FLAGS_brain_gpu_sync_every_op) {
+ // Note: GPUUtil::Sync() only syncs the default stream.
+ // We need to either sync the stream used by this op, or
+ // all streams. Given that this flag is typically used for
+        // debugging, it makes more sense to sync all GPU activity.
+ context->SetStatus(GPUUtil::SyncAll(this));
+ } else {
+ // The GPU kernel has been queued, but may not complete for some
+ // time. As soon as this function completes, the caller will
+ // discard its refs on the inputs, outputs and any scratch
+ // tensors it created. Create additional refs here that will be
+ // held until the kernel completes.
+ for (int ii = 0; ii < context->num_temps(); ++ii) {
+ Tensor* temp = context->temp(ii);
+ VLOG(2) << "Saving ref to temp Tensor @ " << DMAHelper::base(temp);
+ tensor_refs->push_back(*temp);
+ }
+ for (int ii = 0; ii < context->num_outputs(); ++ii) {
+ Tensor* temp = context->mutable_output(ii);
+ if (nullptr != temp) {
+ tensor_refs->push_back(*temp);
+ }
+ }
+ em_->ThenDeleteTensors(stream, tensor_refs);
+ }
+ } else {
+ if (!FLAGS_brain_gpu_sync_every_op) {
+ delete tensor_refs;
+ }
+ }
+ }
+}
+
+Status BaseGPUDevice::Sync() { return GPUUtil::Sync(this); }
+
+void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
+ OpKernelContext* context,
+ AsyncOpKernel::DoneCallback done) {
+ GPUDeviceContext* gpu_device_context = device_contexts_[0];
+ if (context->op_device_context() != nullptr) {
+ gpu_device_context =
+ static_cast<GPUDeviceContext*>(context->op_device_context());
+ }
+ const auto stream_id = gpu_device_context->stream_id();
+
+ VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
+ << op_kernel->def().op() << " on GPU" << gpu_id_ << " stream["
+ << stream_id << "]";
+
+ port::Tracing::TraceMe activity(
+ strings::StrCat(op_kernel->name(), ":", op_kernel->type_string()));
+ op_kernel->ComputeAsync(context, done);
+}
+
+Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
+ const AllocatorAttributes alloc_attrs,
+ Tensor* tensor) {
+ AllocatorAttributes attr;
+ attr.set_on_host(true);
+ attr.set_gpu_compatible(true);
+ Allocator* host_alloc = GetAllocator(attr);
+ Tensor parsed(tensor_proto.dtype());
+ if (!parsed.FromProto(host_alloc, tensor_proto)) {
+ return errors::InvalidArgument("Cannot parse tensor from proto: ",
+ tensor_proto.DebugString());
+ }
+ Status status;
+ if (alloc_attrs.on_host()) {
+ *tensor = parsed;
+ } else {
+ if (!DMAHelper::CanUseDMA(&parsed)) {
+ return errors::Internal("GPU copy from non-DMA ",
+ DataTypeString(parsed.dtype()), " tensor");
+ }
+ Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape());
+ port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto");
+ Notification n;
+ device_contexts_[0]->CopyCPUTensorToDevice(&parsed, this, &copy,
+ [&n, &status](const Status& s) {
+ status = s;
+ n.Notify();
+ });
+ n.WaitForNotification();
+ *tensor = copy;
+ }
+ return status;
+}
+
+namespace {
+#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
+class ConcretePerOpGpuDevice : public PerOpGpuDevice {
+ public:
+ explicit ConcretePerOpGpuDevice(gpu::Stream* stream,
+ EigenAllocator* allocator)
+ : device_(stream, allocator), allocator_(allocator) {}
+ ~ConcretePerOpGpuDevice() { delete allocator_; }
+
+ const Eigen::GpuDevice& device() const override { return device_; }
+
+ private:
+ Eigen::GpuDevice device_;
+ EigenAllocator* allocator_;
+};
+#else
+class ConcretePerOpGpuDevice : public PerOpGpuDevice {
+ public:
+ explicit ConcretePerOpGpuDevice(EigenCudaStreamDevice* stream_device)
+ : device_(stream_device), stream_device_(stream_device) {}
+ ~ConcretePerOpGpuDevice() { delete stream_device_; }
+
+ const Eigen::GpuDevice& device() const override { return device_; }
+
+ private:
+ Eigen::GpuDevice device_;
+ EigenCudaStreamDevice* stream_device_;
+};
+#endif
+} // namespace
+
+const PerOpGpuDevice* BaseGPUDevice::NewDevice(int stream_id,
+ Allocator* allocator) {
+#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
+ auto ea = new EigenAllocator(streams_[stream_id], allocator, em_.get());
+ return new ConcretePerOpGpuDevice(streams_[stream_id], ea);
+#else
+ const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
+ streams_[stream_id]->implementation()->CudaStreamMemberHack());
+ auto es = new EigenCudaStreamDevice(cuda_stream, gpu_id_, allocator);
+ return new ConcretePerOpGpuDevice(es);
+#endif
+}
+
+const PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice(DeviceContext* dc,
+ Allocator* allocator) {
+ if (dc) {
+ const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
+ const int stream_id = gpu_dc->stream_id();
+ VLOG(1) << " eigen_gpu_device(" << dc << ") => stream[" << stream_id
+ << "]";
+ CHECK_LT(stream_id, streams_.size());
+ return NewDevice(stream_id, allocator);
+ } else {
+ return NewDevice(0, allocator);
+ }
+}
+
+void BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
+ const string& name_prefix,
+ std::vector<Device*>* devices) {
+ int n = INT_MAX;
+ auto iter = options.config.device_count().find("GPU");
+ if (iter != options.config.device_count().end()) {
+ n = iter->second;
+ }
+ std::vector<int> valid_gpu_ids;
+ GetValidDeviceIds(&valid_gpu_ids);
+ if (static_cast<size_t>(n) > valid_gpu_ids.size()) {
+ n = valid_gpu_ids.size();
+ }
+ for (int i = 0; i < n; i++) {
+ devices->push_back(CreateGPUDevice(
+ options, strings::StrCat(name_prefix, "/gpu:", i), valid_gpu_ids[i]));
+ }
+}
+
+namespace {
+int64 MinSystemMemory(int64 available_memory) {
+ // We use the following heuristic for now:
+ //
+ // If the available_memory is < 2GiB, we allocate 200MiB to system memory.
+  // Otherwise, allocate the larger of 300MiB and 5% of available_memory.
+ //
+ // In the future we could be more sophisticated by using a table of
+ // devices.
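+  //
+  // For example, with 4GiB available this reserves max(300MiB, ~205MiB)
+  // = 300MiB for the system; with 64GiB it reserves 5%, i.e. ~3.2GiB.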
+ if (available_memory < (1LL << 31)) {
+ // 200MiB
+ return 209715200LL;
+ } else {
+    // max(300 MiB, 0.05 * available_memory)
+ return std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
+ }
+}
+} // namespace
+
+static string GetShortDeviceDescription(int device_id,
+ const gpu::DeviceDescription& desc) {
+ return strings::StrCat("device: ", device_id, ", name: ", desc.name(),
+ ", pci bus id: ", desc.pci_bus_id());
+}
+
+LocalDevice* BaseGPUDeviceFactory::CreateGPUDevice(
+ const SessionOptions& options, const string& name, int gpu_id) {
+ CHECK_GE(gpu_id, 0);
+
+ // Look up the device, to see its attributes.
+ gpu::Platform* gpu_platform = GPUMachineManager();
+ CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount());
+ gpu::StreamExecutor* se =
+ gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+ const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+
+ int64 total_memory, available_memory;
+ CHECK(se->DeviceMemoryUsage(&available_memory, &total_memory));
+
+ int64 allocated_memory = available_memory;
+ double config_memory_fraction =
+ options.config.gpu_options().per_process_gpu_memory_fraction();
+ if (config_memory_fraction == 0) {
+ const int64 min_system_memory = MinSystemMemory(available_memory);
+ if (min_system_memory < allocated_memory) {
+ allocated_memory -= min_system_memory;
+ }
+ } else {
+ allocated_memory *= config_memory_fraction;
+ }
+
+ Bytes allocated_bytes = static_cast<Bytes>(allocated_memory);
+
+ // Get GPU BusAdjacency from its reported NUMA affinity.
+ // Because GPUs are virtualized in some environments, we can't just
+ // use the GPU id.
+ BusAdjacency bus_adjacency = BUS_ANY;
+ switch (desc.numa_node()) {
+ case 0:
+ bus_adjacency = BUS_0;
+ break;
+ case 1:
+ bus_adjacency = BUS_1;
+ break;
+ default:
+ bus_adjacency = BUS_ANY;
+ }
+ VLOG(1) << "GPUDevice id " << gpu_id << " on bus " << bus_adjacency
+ << " numa: " << desc.numa_node() << " pci: " << desc.pci_bus_id();
+
+ ProcessState* process_state = ProcessState::singleton();
+ return CreateGPUDevice(
+ options, name, allocated_bytes, bus_adjacency, gpu_id,
+ GetShortDeviceDescription(gpu_id, desc),
+ process_state->GetGPUAllocator(gpu_id, allocated_memory),
+ process_state->GetCPUAllocator(desc.numa_node()));
+}
+
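+// Reads the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment variable. For
+// example, setting TF_MIN_GPU_MULTIPROCESSOR_COUNT=4 lets GPUs with as few
+// as 4 multiprocessors be used on multi-GPU machines; a missing or invalid
+// value falls back to the default of 8.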
+static int GetMinGPUMultiprocessorCount() {
+ static const int kDefaultMinGPUMultiprocessorCount = 8;
+
+ const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
+
+ if (tf_min_gpu_core_count == nullptr ||
+ strcmp(tf_min_gpu_core_count, "") == 0) {
+ return kDefaultMinGPUMultiprocessorCount;
+ }
+
+ int min_gpu_core_count = -1;
+ if (strings::safe_strto32(tf_min_gpu_core_count, &min_gpu_core_count)) {
+ if (min_gpu_core_count >= 0) {
+ return min_gpu_core_count;
+ }
+ }
+
+ LOG(ERROR) << "Invalid minimum GPU multiprocessor count: ["
+ << tf_min_gpu_core_count << "]. "
+ << "Using the default value: "
+ << kDefaultMinGPUMultiprocessorCount;
+ return kDefaultMinGPUMultiprocessorCount;
+}
+
+void BaseGPUDeviceFactory::GetValidDeviceIds(std::vector<int>* ids) {
+ auto gpu_manager = GPUMachineManager();
+ int min_gpu_core_count = GetMinGPUMultiprocessorCount();
+ if (gpu_manager) {
+ auto visible_device_count = gpu_manager->VisibleDeviceCount();
+ for (int i = 0; i < gpu_manager->VisibleDeviceCount(); ++i) {
+ auto exec_status = gpu_manager->ExecutorForDevice(i);
+ if (!exec_status.ok()) {
+ continue;
+ }
+ gpu::StreamExecutor* se = exec_status.ValueOrDie();
+ const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+ int major, minor;
+ if (!desc.cuda_compute_capability(&major, &minor)) {
+ continue;
+ }
+ // Only consider GPUs with compute capability >= 3.5 (Kepler or
+ // higher)
+ if (major < 3 || (major == 3 && minor < 5)) {
+ LOG(INFO) << "Ignoring gpu device "
+ << "(" << GetShortDeviceDescription(i, desc) << ") "
+ << "with Cuda compute capability " << major << "." << minor
+ << ". The minimum required Cuda capability is 3.5.";
+ continue;
+ }
+
+ // TensorFlow currently places computation on devices assuming
+ // they have similar capability.
+ //
+ // If there are multiple GPUs available on the machine, only
+ // consider GPUs with 8 or more multiprocessors.
+ //
+ // TODO(vrv): In the medium term: we should only filter out GPUs
+ // that are slow relative to the fastest GPU. In the long term,
+ // TensorFlow should support automatic placement based on
+ // capability.
+ if (visible_device_count > 1) {
+ if (desc.core_count() < min_gpu_core_count) {
+ LOG(INFO) << "Ignoring gpu device "
+ << "(" << GetShortDeviceDescription(i, desc) << ") "
+ << "with Cuda multiprocessor count: " << desc.core_count()
+ << ". The minimum required count is " << min_gpu_core_count
+ << ". You can adjust this requirement with the env var "
+ "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
+ continue;
+ }
+ }
+
+ int new_id = ids->size();
+ ids->push_back(i);
+
+ LOG(INFO) << "Creating TensorFlow device (/gpu:" << new_id << ") -> "
+ << "(" << GetShortDeviceDescription(i, desc) << ")";
+ }
+ }
+}
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
new file mode 100644
index 0000000000..a415224d95
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -0,0 +1,94 @@
+#if !GOOGLE_CUDA
+#error This file must only be included when building with Cuda support
+#endif
+
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/core/common_runtime/local_device.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+class EigenAllocator;
+
+class BaseGPUDevice : public LocalDevice {
+ public:
+ BaseGPUDevice(const SessionOptions& options, const string& name,
+ Bytes memory_limit, BusAdjacency bus_adjacency, int gpu_id,
+ const string& physical_device_desc, Allocator* gpu_allocator,
+ Allocator* cpu_allocator);
+
+ ~BaseGPUDevice() override;
+
+  // GPU devices require the Op Compute method to save a reference to
+  // any temporary tensors allocated during Compute until the Op's GPU
+  // execution completes.
+ bool SaveTemporaryTensors() const override { return true; }
+
+ Status FillContextMap(const Graph* graph,
+ DeviceContextMap* device_context_map);
+
+ void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
+
+ Status Sync() override;
+
+ void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
+ AsyncOpKernel::DoneCallback done) override;
+
+ Status MakeTensorFromProto(const TensorProto& tensor_proto,
+ const AllocatorAttributes alloc_attrs,
+ Tensor* tensor) override;
+
+ // The caller owns the returned device.
+ const PerOpGpuDevice* MakeGpuDevice(DeviceContext* dc,
+ Allocator* allocator) override;
+
+ protected:
+ Allocator* gpu_allocator_; // not owned
+ Allocator* cpu_allocator_; // not owned
+
+ private:
+ std::vector<gpu::Stream*> streams_;
+ std::vector<GPUDeviceContext*> device_contexts_;
+ GpuDeviceInfo* gpu_device_info_ = nullptr;
+ mutex trace_mu_;
+ int gpu_id_ = -1;
+ std::unique_ptr<EventMgr> em_;
+
+ const PerOpGpuDevice* NewDevice(int stream_id, Allocator* allocator);
+};
+
+class BaseGPUDeviceFactory : public DeviceFactory {
+ public:
+ void CreateDevices(const SessionOptions& options, const string& name_prefix,
+ std::vector<Device*>* devices) override;
+
+ private:
+ LocalDevice* CreateGPUDevice(const SessionOptions& options,
+ const string& name, int gpu_id);
+
+ virtual LocalDevice* CreateGPUDevice(const SessionOptions& options,
+ const string& name, Bytes memory_limit,
+ BusAdjacency bus_adjacency, int gpu_id,
+ const string& physical_device_desc,
+ Allocator* gpu_allocator,
+ Allocator* cpu_allocator) = 0;
+
+ void GetValidDeviceIds(std::vector<int>* ids);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
new file mode 100644
index 0000000000..240ac47499
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
@@ -0,0 +1,52 @@
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
+
+namespace tensorflow {
+
+void RequireGPUDevice() {}
+
+class GPUDevice : public BaseGPUDevice {
+ public:
+ GPUDevice(const SessionOptions& options, const string& name,
+ Bytes memory_limit, BusAdjacency bus_adjacency, int gpu_id,
+ const string& physical_device_desc, Allocator* gpu_allocator,
+ Allocator* cpu_allocator)
+ : BaseGPUDevice(options, name, memory_limit, bus_adjacency, gpu_id,
+ physical_device_desc, gpu_allocator, cpu_allocator) {}
+
+ Allocator* GetAllocator(AllocatorAttributes attr) override {
+ if (attr.on_host()) {
+ ProcessState* ps = ProcessState::singleton();
+ if (attr.gpu_compatible()) {
+ return ps->GetCUDAHostAllocator(0);
+ } else {
+ return cpu_allocator_;
+ }
+ } else {
+ return gpu_allocator_;
+ }
+ }
+};
+
+class GPUDeviceFactory : public BaseGPUDeviceFactory {
+ private:
+ LocalDevice* CreateGPUDevice(const SessionOptions& options,
+ const string& name, Bytes memory_limit,
+ BusAdjacency bus_adjacency, int gpu_id,
+ const string& physical_device_desc,
+ Allocator* gpu_allocator,
+ Allocator* cpu_allocator) override {
+ return new GPUDevice(options, name, memory_limit, bus_adjacency, gpu_id,
+ physical_device_desc, gpu_allocator, cpu_allocator);
+ }
+};
+
+REGISTER_LOCAL_DEVICE_FACTORY("GPU", GPUDeviceFactory);
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
new file mode 100644
index 0000000000..29d6281733
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -0,0 +1,132 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+
+#include "tensorflow/stream_executor/event.h"
+#include "tensorflow/stream_executor/stream.h"
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+EventMgr::EventMgr(gpu::StreamExecutor* se)
+ : exec_(se),
+ // threadpool_ has 1 thread for the polling loop, and one to execute
+ // event callback functions. Maybe we should have more?
+ threadpool_(Env::Default(), "GPU_Event_Manager", 2) {
+ threadpool_.Schedule([this]() { PollLoop(); });
+}
+
+EventMgr::~EventMgr() {
+ stop_polling_.Notify();
+ // Shut down the backup polling loop.
+ polling_stopped_.WaitForNotification();
+
+ // Events are owned by this object.
+ for (auto& e : free_events_) {
+ delete e;
+ }
+ while (!used_events_.empty()) {
+ delete used_events_[0].event;
+ delete used_events_[0].mem;
+ if (used_events_[0].bufrec.buf) {
+ used_events_[0].bufrec.alloc->DeallocateRaw(used_events_[0].bufrec.buf);
+ }
+ if (used_events_[0].func != nullptr)
+ threadpool_.Schedule(used_events_[0].func);
+ used_events_.pop_front();
+ }
+}
+
+// This polling loop runs at a relatively low frequency. Most calls to
+// PollEvents() should come directly from Compute() via
+// ThenDeleteTensors(). This function's purpose is to ensure that
+// even if no more GPU operations are being requested, we still
+// eventually clear the queue. It seems to prevent some tensorflow
+// programs from stalling for reasons not yet understood.
+void EventMgr::PollLoop() {
+ while (!stop_polling_.HasBeenNotified()) {
+ Env::Default()->SleepForMicroseconds(1 * 1000);
+ {
+ mutex_lock l(mu_);
+ PollEvents(true);
+ }
+ }
+ polling_stopped_.Notify();
+}
+
+void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) {
+ VLOG(2) << "QueueInUse free_events_ " << free_events_.size()
+ << " used_events_ " << used_events_.size();
+ // Events are created on demand, and repeatedly reused. There is no
+ // limit placed here on the number of allocated Events.
+ if (free_events_.empty()) {
+ free_events_.push_back(new gpu::Event(exec_));
+ free_events_.back()->Init();
+ }
+ gpu::Event* e = free_events_.back();
+ free_events_.pop_back();
+ stream->ThenRecordEvent(e);
+ iu.event = e;
+ used_events_.push_back(iu);
+}
+
+// This function must be called periodically to check whether pending
+// events have recorded, and then retire them. Initial observations
+// suggest that typical behavior in a TensorFlow program is to have
+// 0-3 events pending most of the time, but there are occasionally
+// spikes of up to several hundred outstanding.
+//
+// NOTE: If all events are on the same stream, no later event will
+// complete before an earlier event, except possibly if the earlier
+// event transitions to an error state, so there's no advantage in
+// looking past the first kPending event. However, if we're using
+// multiple streams there may be some gain in looking deeper.
+// As a compromise, PollEvent() calls that are triggered by the queueing
+// of a single event never look past the first kPending event. Calls
+// coming from the dedicated polling thread always sweep the full queue.
+//
+// Note that allowing the queue to grow very long could cause overall
+// GPU memory use to spike needlessly. An alternative strategy would
+// be to throttle new Op execution until the pending event queue
+// clears.
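+//
+// For example, if the queue holds [complete, pending, complete] in that
+// order, a call from ThenDeleteTensors() retires only the first record and
+// stops at the pending one, while the dedicated poller also retires the
+// third; the pending record stays queued until a later sweep finds it
+// complete.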
+void EventMgr::PollEvents(bool is_dedicated_poller) {
+ VLOG(2) << "PollEvents free_events_ " << free_events_.size()
+ << " used_events_ " << used_events_.size();
+ // Sweep the remaining events in order. If this is the dedicated
+ // polling thread, check the entire set. Otherwise, just sweep up to
+ // the first non-complete record that is still pending.
+ for (auto& iu : used_events_) {
+ if (iu.event == nullptr) continue;
+ gpu::Event::Status s = iu.event->PollForStatus();
+ switch (s) {
+ case gpu::Event::Status::kUnknown:
+ case gpu::Event::Status::kError:
+ // We don't expect to see these. Someday maybe propagate
+ // a Status error, but for now fail hard.
+ LOG(FATAL) << "Unexpected Event status: " << static_cast<int>(s);
+ break;
+ case gpu::Event::Status::kPending:
+ if (!is_dedicated_poller) return; // quit processing queue
+ break;
+ case gpu::Event::Status::kComplete:
+ delete iu.mem;
+ if (iu.bufrec.buf) iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf);
+ // The function must be called in another thread, outside of
+ // the mutex held here.
+ if (iu.func != nullptr) threadpool_.Schedule(iu.func);
+ free_events_.push_back(iu.event);
+ // Mark this InUse record as completed.
+ iu.event = nullptr;
+ }
+ }
+ // Then clear any completed InUse records from the front of the queue.
+ while (!used_events_.empty()) {
+ InUse& iu = used_events_.front();
+ if (iu.event == nullptr) {
+ used_events_.pop_front();
+ } else {
+ break;
+ }
+ }
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
new file mode 100644
index 0000000000..f9436566d4
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -0,0 +1,118 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_
+
+#include <deque>
+#include <vector>
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/tensor.h"
+
+namespace perftools {
+namespace gputools {
+class Event;
+class Stream;
+class StreamExecutor;
+} // namespace gputools
+} // namespace perftools
+
+namespace tensorflow {
+
+// An object to keep track of pending Events in the StreamExecutor streams
+// and associated Tensors that cannot safely be deleted until the associated
+// Events are recorded.
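+//
+// Typical use, as a sketch with illustrative names: after enqueueing GPU
+// work on a stream, hand the host-side Tensor references to the EventMgr
+// so the underlying buffers outlive the queued kernels:
+//
+//   std::vector<Tensor>* refs = new std::vector<Tensor>{input, output};
+//   em->ThenDeleteTensors(stream, refs);  // freed once the stream catches up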
+class EventMgr {
+ public:
+ explicit EventMgr(perftools::gputools::StreamExecutor* se);
+
+ ~EventMgr();
+
+ // Takes ownership of *tensors and deletes it as soon as all events
+ // currently enqueued on *stream have completed.
+ inline void ThenDeleteTensors(perftools::gputools::Stream* stream,
+ std::vector<Tensor>* tensors) {
+ mutex_lock l(mu_);
+ QueueTensors(stream, tensors);
+ PollEvents(false);
+ }
+
+ struct BufRec {
+ Allocator* alloc;
+ void* buf;
+ };
+
+ // Takes ownership of *bufrec.buf and calls bufrec.alloc->DeallocateRaw()
+ // on it as soon as all events currently enqueued on *stream have completed.
+ inline void ThenDeleteBuffer(perftools::gputools::Stream* stream,
+ BufRec bufrec) {
+ mutex_lock l(mu_);
+ QueueBuffer(stream, bufrec);
+ PollEvents(false);
+ }
+
+ inline void ThenExecute(perftools::gputools::Stream* stream,
+ std::function<void()> func) {
+ mutex_lock l(mu_);
+ QueueFunc(stream, func);
+ PollEvents(false);
+ }
+
+ private:
+ friend class TEST_EventMgrHelper;
+ mutex mu_;
+ perftools::gputools::StreamExecutor* exec_;
+
+ struct InUse {
+ perftools::gputools::Event* event;
+ std::vector<Tensor>* mem;
+ BufRec bufrec;
+ std::function<void()> func;
+ };
+
+ // Stream-enqueue an unused Event and save with it a collection of
+ // Tensors and/or a BufRec to be deleted only after the Event
+ // records.
+ void QueueInUse(perftools::gputools::Stream* stream, InUse in_use)
+ EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+ void QueueTensors(perftools::gputools::Stream* stream,
+ std::vector<Tensor>* tensors)
+ EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr});
+ }
+
+ void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec)
+ EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr});
+ }
+
+ void QueueFunc(perftools::gputools::Stream* stream,
+ std::function<void()> func) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ QueueInUse(stream, {nullptr, nullptr, BufRec(), func});
+ }
+
+ // This function should be called at roughly the same tempo as
+ // QueueTensors() to check whether pending events have recorded,
+ // and then retire them.
+ void PollEvents(bool is_dedicated_poller) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+ // An internal polling loop that runs at a low frequency to clear
+ // straggler Events.
+ void PollLoop();
+
+ // A stack of unused events
+ std::vector<perftools::gputools::Event*> free_events_ GUARDED_BY(mu_);
+
+ // A FIFO queue of InUse events and associated tensors.
+ std::deque<InUse> used_events_ GUARDED_BY(mu_);
+
+ Notification stop_polling_;
+ Notification polling_stopped_;
+
+ // The main PollLoop for the event manager runs in this threadpool.
+ thread::ThreadPool threadpool_;
+};
+
+} // namespace tensorflow
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
new file mode 100644
index 0000000000..30ca1ff187
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -0,0 +1,152 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include <gtest/gtest.h>
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+class TEST_EventMgrHelper {
+ public:
+ explicit TEST_EventMgrHelper(EventMgr* em) : em_(em) {}
+
+ int queue_size() {
+ mutex_lock l(em_->mu_);
+ return em_->used_events_.size();
+ }
+
+ int free_size() {
+ mutex_lock l(em_->mu_);
+ return em_->free_events_.size();
+ }
+
+ void QueueTensors(perftools::gputools::Stream* stream,
+ std::vector<Tensor>* tensors) {
+ mutex_lock l(em_->mu_);
+ em_->QueueTensors(stream, tensors);
+ }
+
+ void PollEvents(bool is_dedicated_poller) {
+ mutex_lock l(em_->mu_);
+ em_->PollEvents(is_dedicated_poller);
+ }
+
+ private:
+ EventMgr* em_;
+};
+
+namespace {
+
+TEST(EventMgr, Empty) {
+ auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
+ EventMgr em(stream_exec);
+ TEST_EventMgrHelper th(&em);
+ EXPECT_EQ(0, th.queue_size());
+ EXPECT_EQ(0, th.free_size());
+}
+
+// Delaying polling until after several enqueuings should grow the
+// total number of allocated events. Once we have enough events for
+// the max simultaneously pending, we should not allocate any more.
+TEST(EventMgr, DelayedPolling) {
+ auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
+ EventMgr em(stream_exec);
+ TEST_EventMgrHelper th(&em);
+ EXPECT_EQ(0, th.queue_size());
+ std::vector<Tensor>* v = nullptr;
+ std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+ CHECK(stream.get());
+ stream->Init();
+ for (int i = 0; i < 5; ++i) {
+ v = new std::vector<Tensor>;
+ th.QueueTensors(stream.get(), v);
+ EXPECT_EQ(i + 1, th.queue_size());
+ EXPECT_EQ(0, th.free_size());
+ }
+ th.PollEvents(false);
+ EXPECT_EQ(0, th.queue_size());
+ EXPECT_EQ(5, th.free_size());
+ for (int j = 0; j < 2; ++j) {
+ for (int i = 0; i < 5; ++i) {
+ v = new std::vector<Tensor>;
+ th.QueueTensors(stream.get(), v);
+ EXPECT_EQ(i + 1, th.queue_size());
+ EXPECT_EQ(4 - i, th.free_size());
+ }
+ th.PollEvents(false);
+ EXPECT_EQ(0, th.queue_size());
+ EXPECT_EQ(5, th.free_size());
+ }
+}
+
+// Immediate polling should require only one event to be allocated.
+TEST(EventMgr, ImmediatePolling) {
+ auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
+ EventMgr em(stream_exec);
+ TEST_EventMgrHelper th(&em);
+ EXPECT_EQ(0, th.queue_size());
+ EXPECT_EQ(0, th.free_size());
+ std::vector<Tensor>* v = nullptr;
+ std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+ CHECK(stream.get());
+ stream->Init();
+ for (int i = 0; i < 5; ++i) {
+ v = new std::vector<Tensor>;
+ em.ThenDeleteTensors(stream.get(), v);
+ EXPECT_EQ(0, th.queue_size());
+ EXPECT_EQ(1, th.free_size());
+ }
+}
+
+// If we delay polling by more than 1 second, the backup polling loop
+// should clear the queue.
+TEST(EventMgr, LongDelayedPolling) {
+ auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
+ EventMgr em(stream_exec);
+ TEST_EventMgrHelper th(&em);
+ EXPECT_EQ(0, th.queue_size());
+ EXPECT_EQ(0, th.free_size());
+ std::vector<Tensor>* v = nullptr;
+ std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+ CHECK(stream.get());
+ stream->Init();
+ for (int i = 0; i < 5; ++i) {
+ v = new std::vector<Tensor>;
+ th.QueueTensors(stream.get(), v);
+ EXPECT_EQ(1 + i, th.queue_size());
+ EXPECT_EQ(0, th.free_size());
+ }
+ sleep(1);
+ EXPECT_EQ(0, th.queue_size());
+ EXPECT_EQ(5, th.free_size());
+}
+
+// Deleting the EventMgr when events are still pending should shut
+// down gracefully.
+TEST(EventMgr, NonEmptyShutdown) {
+ auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
+ EventMgr em(stream_exec);
+ TEST_EventMgrHelper th(&em);
+ EXPECT_EQ(0, th.queue_size());
+ EXPECT_EQ(0, th.free_size());
+ std::vector<Tensor>* v = nullptr;
+ std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+ CHECK(stream.get());
+ stream->Init();
+ for (int i = 0; i < 5; ++i) {
+ v = new std::vector<Tensor>;
+ th.QueueTensors(stream.get(), v);
+ EXPECT_EQ(1 + i, th.queue_size());
+ EXPECT_EQ(0, th.free_size());
+ }
+}
+
+} // namespace
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.cc b/tensorflow/core/common_runtime/gpu/gpu_init.cc
new file mode 100644
index 0000000000..631a47eb91
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.cc
@@ -0,0 +1,147 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+
+#include <string>
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+namespace {
+
+std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(
+ gpu::Platform* platform, int device_count) {
+ auto* map = new std::map<std::pair<int, int>, bool>;
+ for (int i = 0; i < device_count; ++i) {
+ for (int j = 0; j < device_count; ++j) {
+ gpu::StreamExecutor* from = platform->ExecutorForDevice(i).ValueOrDie();
+ gpu::StreamExecutor* to = platform->ExecutorForDevice(j).ValueOrDie();
+ (*map)[{i, j}] = from->CanEnablePeerAccessTo(to);
+ }
+ }
+
+ return std::unique_ptr<std::map<std::pair<int, int>, bool>>{map};
+}
+
+Status EnablePeerAccess(gpu::Platform* platform, int device_count) {
+ for (int i = 0; i < device_count; ++i) {
+ for (int j = 0; j < device_count; ++j) {
+ gpu::StreamExecutor* from = platform->ExecutorForDevice(i).ValueOrDie();
+ gpu::StreamExecutor* to = platform->ExecutorForDevice(j).ValueOrDie();
+
+ if (from->CanEnablePeerAccessTo(to)) {
+ auto status = from->EnablePeerAccessTo(to);
+ if (!status.ok()) {
+ return errors::Internal(status.ToString());
+ }
+ } else {
+ LOG(INFO) << "cannot enable peer access from device ordinal " << i
+ << " to device ordinal " << j;
+ }
+ }
+ }
+ return Status::OK();
+}
+
+static void InitGPU() {
+ auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
+ if (!result.ok()) {
+ LOG(WARNING)
+ << "Not initializing the GPU, could not create GPU MachineManager. "
+ << "Error: " << result.status();
+ return;
+ }
+
+ gpu::Platform* platform = result.ValueOrDie();
+
+ int dev_count = platform->VisibleDeviceCount();
+
+ if (dev_count == 0) {
+ LOG(INFO) << "No GPU devices available on machine.";
+ return;
+ }
+
+ for (int i = 0; i < dev_count; ++i) {
+ auto stream_exec = platform->ExecutorForDevice(i).ValueOrDie();
+ int64 free_bytes;
+ int64 total_bytes;
+ if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
+ // Logs internally on failure.
+ free_bytes = 0;
+ total_bytes = 0;
+ }
+ const auto& description = stream_exec->GetDeviceDescription();
+ int cc_major;
+ int cc_minor;
+ if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
+ // Logs internally on failure.
+ cc_major = 0;
+ cc_minor = 0;
+ }
+ LOG(INFO) << "Found device " << i << " with properties: "
+ << "\nname: " << description.name() << "\nmajor: " << cc_major
+ << " minor: " << cc_minor << " memoryClockRate (GHz) "
+ << description.clock_rate_ghz() << "\npciBusID "
+ << description.pci_bus_id() << "\nTotal memory: "
+ << strings::HumanReadableNumBytes(total_bytes)
+ << "\nFree memory: "
+ << strings::HumanReadableNumBytes(free_bytes);
+ }
+
+ // Enable peer access
+
+ auto status = EnablePeerAccess(platform, dev_count);
+ if (!status.ok()) {
+ LOG(FATAL) << "could not enable peer access for GPU devices: " << status;
+ }
+
+ // Print out a matrix showing which devices can DMA to one
+ // another.
+ auto access_map = GetPeerAccessMap(platform, dev_count);
+ string line_buf = "DMA: ";
+ for (int i = 0; i < dev_count; ++i) {
+ strings::StrAppend(&line_buf, i, " ");
+ }
+ LOG(INFO) << line_buf;
+ for (int i = 0; i < dev_count; ++i) {
+ line_buf = strings::StrCat(i, ": ");
+ for (int j = 0; j < dev_count; ++j) {
+ if ((*access_map)[{i, j}]) {
+ line_buf.append("Y ");
+ } else {
+ line_buf.append("N ");
+ }
+ }
+ LOG(INFO) << line_buf;
+ }
+}
+
+static bool InitModule() {
+ InitGPU();
+ return true;
+}
+
+} // namespace
+
+gpu::Platform* GPUMachineManager() {
+ // Create the machine manager singleton and initialize the GPUs only
+ // once.
+ static bool init = InitModule();
+ CHECK(init); // Avoids compiler warning that init is unused.
+
+ auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
+ if (!result.ok()) {
+ return nullptr;
+ }
+
+ return result.ValueOrDie();
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.h b/tensorflow/core/common_runtime/gpu/gpu_init.h
new file mode 100644
index 0000000000..d126a8b1ca
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.h
@@ -0,0 +1,19 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_
+
+namespace perftools {
+namespace gputools {
+class Platform;
+} // namespace gputools
+} // namespace perftools
+
+namespace tensorflow {
+
+// Returns the GPU machine manager singleton, creating it and
+// initializing the GPUs on the machine if needed the first time it is
+// called.
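+//
+// Example (sketch):
+//   perftools::gputools::Platform* platform = GPUMachineManager();
+//   if (platform != nullptr) {
+//     int device_count = platform->VisibleDeviceCount();
+//     // ...
+//   }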
+perftools::gputools::Platform* GPUMachineManager();
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc
new file mode 100644
index 0000000000..08ff55e221
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_region_allocator.cc
@@ -0,0 +1,371 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h"
+
+//#include "base/commandlineflags.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/lib/core/bits.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+
+#if defined(PLATFORM_GOOGLE)
+DEFINE_bool(brain_gpu_region_allocator_heap_check_on_destruction, true,
+ "If true, the CUDA gpu manager checks that all allocated "
+ "memory through the GPU memory pool implementation has been "
+ "freed.");
+
+DEFINE_int64(brain_gpu_region_allocator_region_size, 0,
+ "If > 0, sets the default chunk-size allocatable from GPU memory. "
+ "Else defaults to entire GPU memory.");
+
+#else
+bool FLAGS_brain_gpu_region_allocator_heap_check_on_destruction = true;
+tensorflow::int64 FLAGS_brain_gpu_region_allocator_region_size = 0;
+#endif
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+GPURegionAllocator::GPURegionAllocator(int device_id, size_t total_bytes)
+ : device_id_(device_id), total_bytes_(total_bytes) {
+ // Get a pointer to the stream_executor for this device
+ stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+
+ // Set the region size based on explicit user request, or based on
+ // total GPU capacity.
+ if (FLAGS_brain_gpu_region_allocator_region_size > 0) {
+ region_size_ = FLAGS_brain_gpu_region_allocator_region_size;
+ } else {
+ region_size_ = static_cast<size_t>(total_bytes_);
+ }
+
+ LOG(INFO) << "Setting region size to " << region_size_;
+}
+
+GPURegionAllocator::~GPURegionAllocator() {
+ if (FLAGS_brain_gpu_region_allocator_heap_check_on_destruction) {
+ CheckForMemoryLeaks();
+ }
+
+ gtl::STLDeleteValues(&chunk_map_);
+
+ for (auto r : regions_) {
+ gpu::DeviceMemoryBase gpu_ptr{r->ptr};
+ stream_exec_->Deallocate(&gpu_ptr);
+ delete r;
+ }
+}
+
+void* GPURegionAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
+ static const int64 kMaxMillisToWait = 10000; // 10 seconds
+ return retry_helper_.AllocateRaw(
+ [this](size_t a, size_t nb, bool v) {
+ return AllocateRawInternal(a, nb, v);
+ },
+ kMaxMillisToWait, alignment, num_bytes);
+}
+
+void* GPURegionAllocator::AllocateRawInternal(size_t alignment,
+ size_t num_bytes,
+ bool dump_log_on_failure) {
+ if (num_bytes == 0) {
+ LOG(ERROR) << "tried to allocate 0 bytes";
+ return nullptr;
+ }
+ size_t chunk_size = ChunkSize(num_bytes);
+
+ VLOG(2) << "chunk_size " << chunk_size << " from num_bytes "
+ << strings::HumanReadableNumBytes(num_bytes);
+ mutex_lock l(lock_);
+ Pool* pool = &pools_[chunk_size];
+ if (pool->num_free == 0) {
+ if (!ExpandPool(pool, chunk_size, num_bytes, dump_log_on_failure)) {
+ if (dump_log_on_failure) {
+ LOG(WARNING) << "Out of GPU memory, see memory state dump above";
+ }
+ return nullptr;
+ }
+ }
+ CHECK_LT(0, pool->num_free);
+ CHECK(pool->first);
+ CHECK(pool->last);
+ Chunk* c = pool->first;
+ CHECK(c);
+ CHECK(!c->in_use);
+
+ c->in_use = true;
+ // Move c to the back of the queue.
+ if (c->next != nullptr) {
+ pool->first = c->next;
+ pool->first->prev = nullptr;
+ c->next = nullptr;
+ }
+
+ if (pool->last != c) {
+ pool->last->next = c;
+ c->prev = pool->last;
+ pool->last = c;
+ }
+ pool->num_free--;
+ pool->cumulative_malloced++;
+
+ void* rv = c->ptr;
+ c->bytes_allocated = num_bytes;
+
+ VLOG(2) << "new ptr " << rv;
+ return rv;
+}
+
+void GPURegionAllocator::DeallocateRaw(void* ptr) {
+ retry_helper_.DeallocateRaw([this](void* p) { DeallocateRawInternal(p); },
+ ptr);
+}
+
+void GPURegionAllocator::DeallocateRawInternal(void* ptr) {
+ VLOG(2) << "DeallocateRaw: " << ptr;
+ if (ptr == nullptr) {
+ LOG(ERROR) << "tried to deallocate nullptr";
+ return;
+ }
+
+ mutex_lock l(lock_);
+ ChunkMap::const_iterator iter = chunk_map_.find(ptr);
+ CHECK(iter != chunk_map_.end());
+
+ Chunk* c = iter->second;
+ VLOG(2) << "chunk of size " << c->size << " at " << c;
+
+ Pool* pool = &(pools_[c->size]);
+ // Move chunk to head of queue, and mark free.
+ DCHECK(c->in_use);
+ c->in_use = false;
+ if (c->prev) c->prev->next = c->next;
+ if (c->next) c->next->prev = c->prev;
+ if (pool->first == c) pool->first = c->next;
+ if (pool->last == c) pool->last = c->prev;
+ c->next = pool->first;
+ c->prev = nullptr;
+ if (c->next) c->next->prev = c;
+ pool->first = c;
+ if (pool->last == nullptr) pool->last = c;
+ pool->num_free++;
+ pool->cumulative_freed++;
+}
+
+bool GPURegionAllocator::ExpandPool(Pool* pool, size_t chunk_size,
+ size_t requested_size,
+ bool dump_log_on_failure) {
+ VLOG(1) << "ExpandPool of " << chunk_size << " from " << pool->num_chunks
+ << " current members";
+ DCHECK_NE(0, chunk_size);
+  // Decide how many chunks to add: if the pool is empty, seed it with
+  // roughly 4KiB worth of chunks (at least one); otherwise double the
+  // pool size. Expansion is capped at about 1MiB of aggregate chunk memory.
+ int num_chunks = pool->num_chunks;
+ if (num_chunks == 0) {
+ if (chunk_size > 4096) {
+ num_chunks = 1;
+ } else {
+ num_chunks = 4096 / chunk_size;
+ }
+ }
+ // For larger chunks, limit the amount of expansion.
+ size_t aggregate_size = num_chunks * chunk_size;
+ if (aggregate_size > (1 << 20)) {
+ num_chunks = static_cast<int>(
+ std::max(static_cast<size_t>(1), (1 << 20) / chunk_size));
+ }
+ while (num_chunks > 0) {
+ Region* r = (regions_.empty() ? nullptr : regions_.back());
+ if (r == nullptr ||
+ (((r->ptr + r->size) - r->next) < static_cast<int64>(chunk_size))) {
+ // Current region is not large enough to accommodate another chunk.
+ while (r == nullptr || (((r->ptr + r->size) - r->next) <
+ static_cast<int64>(chunk_size))) {
+ // Get another region.
+ size_t this_region_size = std::max(region_size_, chunk_size);
+
+ // Check if we would exceed our limit.
+ if (allocated_memory_ + this_region_size > total_bytes_) {
+ if (dump_log_on_failure) DumpMemoryLog();
+ return false;
+ }
+
+ // Perform the allocation, still checking that the allocator
+ // has not run out of memory.
+ gpu::DeviceMemory<char> gpu_mem =
+ stream_exec_->AllocateArray<char>(this_region_size);
+ if (gpu_mem == nullptr) {
+ if (dump_log_on_failure) DumpMemoryLog();
+ return false;
+ }
+
+ // We never release memory once expanded.
+ allocated_memory_ += this_region_size;
+
+ Region* nr = new Region;
+ nr->ptr = static_cast<char*>(gpu_mem.opaque());
+
+ if (VLOG_IS_ON(2)) {
+ int64 free_bytes;
+ int64 total_bytes;
+ if (stream_exec_->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
+ VLOG(2) << "free " << free_bytes << " total " << total_bytes;
+ } else {
+ // Note: stream_exec call also logs internally on failure.
+ VLOG(2) << "could not retrieve memory usage";
+ }
+ }
+ VLOG(1) << "new Region of size " << this_region_size << " at "
+ << static_cast<void*>(nr->ptr) << " on device " << device_id_;
+ r = nr;
+ r->size = this_region_size;
+ r->next = r->ptr;
+ regions_.push_back(r);
+
+ for (auto visitor : region_visitors_) {
+ visitor(r->ptr, r->size);
+ }
+ }
+ } else {
+ // Allocate a new chunk and push on front of Pool.
+ Chunk* c = new Chunk;
+ c->ptr = r->next;
+ chunk_map_[c->ptr] = c;
+ c->size = chunk_size;
+ r->next += chunk_size;
+ c->next = pool->first;
+ if (c->next != nullptr) c->next->prev = c;
+ pool->first = c;
+ if (pool->last == nullptr) pool->last = c;
+ pool->num_chunks++;
+ pool->num_free++;
+ --num_chunks;
+ }
+ }
+
+ return true;
+}
+
+void GPURegionAllocator::CheckForMemoryLeaks() {
+ std::vector<string> errors;
+ mutex_lock l(lock_); // could use reader lock
+ for (auto pool_map : pools_) {
+ const Pool& p = pool_map.second;
+ Chunk* curr_chunk = p.first;
+ while (curr_chunk != nullptr) {
+ if (curr_chunk->in_use) {
+ errors.push_back(
+ strings::StrCat("Unfreed chunk of size ", curr_chunk->size));
+ }
+ curr_chunk = curr_chunk->next;
+ }
+ }
+ if (!errors.empty()) {
+ LOG(FATAL) << "GPU Memory leaks:\n" << str_util::Join(errors, "\n");
+ }
+}
+
+// Since there's no merging of chunks once allocated, we want to
+// maximize their reusability (which argues for fewer, larger sizes),
+// while minimizing waste (which argues for tight-fitting sizes).
+//
+// The smallest unit of allocation is 256 bytes.
+// NOTE(tucker): akrizhevsky says that nvidia's memory manager always
+// aligns to 256 bytes, and doing so results in significant speedup.
+//
+// Up to 2^16 bytes we only allocate in powers of 2.
+//
+// Above that, we pick a max-waste which is the largest power
+// of 2 <= 1/16 of the requested size, then round up to the nearest
+// multiple of max_waste.
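+//
+// For example, for a 100000-byte request the next power of 2 is 2^17, so
+// max_waste = 2^13 = 8192 and the returned chunk size is 106496 bytes
+// (13 * 8KiB), wasting at most 8KiB.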
+//
+// static
+size_t GPURegionAllocator::ChunkSize(size_t bytes) {
+ if (bytes <= 256) {
+ return 256;
+ } else if (bytes <= (1 << 16)) {
+ return 1uLL << Log2Ceiling64(bytes);
+ } else {
+ // 1/16th of requested size
+ size_t max_waste = 1uLL << (Log2Ceiling64(bytes) - 4);
+ return (bytes + max_waste) & (~(max_waste - 1));
+ }
+}
+
+void GPURegionAllocator::AddAllocVisitor(Visitor visitor) {
+ VLOG(1) << "AddVisitor";
+ mutex_lock l(lock_);
+ region_visitors_.push_back(visitor);
+ for (auto region : regions_) {
+ visitor(region->ptr, region->size);
+ }
+}
+
+void GPURegionAllocator::DumpMemoryLog() {
+ size_t region_bytes = 0;
+ for (auto r : regions_) {
+ region_bytes += r->size;
+ }
+ size_t chunk_bytes = 0;
+ std::vector<size_t> chunk_sizes;
+ for (auto i : pools_) {
+ chunk_sizes.push_back(i.first);
+ }
+ std::sort(chunk_sizes.begin(), chunk_sizes.end());
+ for (auto i : chunk_sizes) {
+ int32 chunks_in_use = 0;
+ const Pool& p = pools_[i];
+ chunk_bytes += i * p.num_chunks;
+
+ if (p.num_chunks > 0) {
+ // Iterate backwards (allocated chunks are last).
+ Chunk* curr_chunk = p.last;
+ while (curr_chunk != nullptr) {
+ if (curr_chunk->in_use) {
+ ++chunks_in_use;
+ }
+ curr_chunk = curr_chunk->prev;
+ if (curr_chunk == p.first) {
+ break;
+ }
+ }
+ }
+
+ LOG(INFO) << "Chunk size: " << i << " ("
+ << strings::HumanReadableNumBytes(i) << ") Pool: " << p.ToString()
+ << "\nNumber of chunks: " << p.num_chunks
+ << ", in_use chunks: " << chunks_in_use;
+ }
+
+ LOG(INFO) << "Aggregate Region Memory: " << region_bytes << " ("
+ << strings::HumanReadableNumBytes(region_bytes) << ")";
+ LOG(INFO) << "Aggregate Chunk Memory: " << chunk_bytes << " ("
+ << strings::HumanReadableNumBytes(chunk_bytes) << ")";
+}
+
+bool GPURegionAllocator::TracksAllocationSizes() { return true; }
+
+size_t GPURegionAllocator::RequestedSize(void* ptr) {
+ mutex_lock l(lock_);
+ auto it = chunk_map_.find(ptr);
+ CHECK(it != chunk_map_.end())
+ << "Asked for requested size of pointer we never allocated: " << ptr;
+ auto c = it->second;
+ return c->bytes_allocated;
+}
+
+size_t GPURegionAllocator::AllocatedSize(void* ptr) {
+ mutex_lock l(lock_);
+ auto it = chunk_map_.find(ptr);
+ CHECK(it != chunk_map_.end())
+ << "Asked for allocated size of pointer we never allocated: " << ptr;
+ auto c = it->second;
+ return c->size;
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_region_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_region_allocator.h
new file mode 100644
index 0000000000..1a250b6ede
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_region_allocator.h
@@ -0,0 +1,146 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_REGION_ALLOCATOR_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_REGION_ALLOCATOR_H_
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/stream_executor/stream_executor.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
+#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+
+namespace tensorflow {
+
+class GPURegionAllocator : public VisitableAllocator {
+ public:
+ // 'device_id' must be a valid device on the machine.
+ //
+ // total_bytes is how many bytes this allocator should allocate up
+ // to. This may be less than the total available.
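+  //
+  // Example (sketch), limiting the allocator to 64MiB on device 0:
+  //   GPURegionAllocator a(0, 1 << 26);
+  //   void* p = a.AllocateRaw(32 /* alignment */, 1024);
+  //   a.DeallocateRaw(p);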
+ explicit GPURegionAllocator(int device_id, size_t total_bytes);
+ ~GPURegionAllocator() override;
+
+ string Name() override { return "gpu_region"; }
+ void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+ void DeallocateRaw(void* ptr) override;
+ void AddAllocVisitor(Visitor visitor) override;
+ // Does nothing, because regions are never freed.
+ void AddFreeVisitor(Visitor visitor) override {}
+
+ bool TracksAllocationSizes() override;
+ size_t RequestedSize(void* ptr) override;
+ size_t AllocatedSize(void* ptr) override;
+
+ private:
+ // A Chunk is the header on a single piece of memory given back
+ // in response to an AllocateRaw() call.
+ struct Chunk {
+ char* ptr; // pointer to granted GPU buffer.
+ size_t size; // Full size of GPU buffer.
+ size_t bytes_allocated; // Bytes asked for by client.
+ bool in_use;
+ Chunk* prev; // Used for chaining in pool.
+ Chunk* next;
+ Chunk()
+ : ptr(nullptr),
+ size(0),
+ bytes_allocated(0),
+ in_use(false),
+ prev(nullptr),
+ next(nullptr) {}
+ };
+
+ // A Pool is a collection of same-sized Chunks.
+ struct Pool {
+ int num_chunks; // total chunks in this pool
+ int num_free; // total free chunks in this pool
+ int64 cumulative_malloced; // number of chunks malloced so far
+ int64 cumulative_freed; // number of chunks freed so far
+
+    // doubly-linked list of chunks; all free chunks precede all
+ // granted chunks
+ Chunk* first;
+ Chunk* last;
+ Pool()
+ : num_chunks(0),
+ num_free(0),
+ cumulative_malloced(0),
+ cumulative_freed(0),
+ first(nullptr),
+ last(nullptr) {}
+
+ string ToString() const {
+ return strings::StrCat("chunks: ", num_chunks, " free: ", num_free,
+ " cumulative malloc: ", cumulative_malloced,
+ " cumulative freed: ", cumulative_freed);
+ }
+ };
+
+ // A Region is a single area of GPU memory that has been
+ // reserved by this class and carved up into Chunks.
+ struct Region {
+ char* ptr; // base GPU ptr
+ char* next; // frontier of unused part of region
+ size_t size;
+ Region() : ptr(nullptr), size(0) {}
+ };
+
+ // Calculate size of chunk for an allocation of this size.
+  // Min chunk size is 256 bytes, which also satisfies alignment needs.
+ // For larger sizes, we round up somewhat so there are fewer
+ // size-specific pools.
+ static size_t ChunkSize(size_t bytes);
+
+ void* AllocateRawInternal(size_t alignment, size_t num_bytes,
+ bool dump_log_on_failure);
+ void DeallocateRawInternal(void* ptr);
+
+ bool ExpandPool(Pool* p, size_t chunk_size, size_t requested_size,
+ bool dump_log_on_failure) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+ // Inspects region maps and crashes with debug information if there
+ // are any memory leaks as detected by the region allocator.
+ void CheckForMemoryLeaks() LOCKS_EXCLUDED(lock_);
+
+ void DumpMemoryLog() EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+ perftools::gputools::StreamExecutor* stream_exec_; // Not owned.
+
+ typedef std::unordered_map<size_t, Pool> PoolMap;
+ typedef std::unordered_map<void*, Chunk*> ChunkMap;
+
+ GPUAllocatorRetry retry_helper_;
+ mutable mutex lock_;
+ PoolMap pools_ GUARDED_BY(lock_);
+
+ // Owns regions.
+ std::vector<Region*> regions_ GUARDED_BY(lock_);
+
+ // Maps from GPU ptr to Chunk owning it.
+ //
+ // Owns chunks.
+ ChunkMap chunk_map_ GUARDED_BY(lock_);
+
+ // Called once on each region, ASAP.
+ std::vector<Visitor> region_visitors_ GUARDED_BY(lock_);
+
+ const int device_id_;
+
+ // Total amount of memory (in bytes) available to this Allocator
+ const size_t total_bytes_;
+
+ // Total amount of memory allocated to regions.
+ size_t allocated_memory_ = 0;
+
+ size_t region_size_ = 0;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(GPURegionAllocator);
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_REGION_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_region_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_region_allocator_test.cc
new file mode 100644
index 0000000000..07b0dd57f6
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_region_allocator_test.cc
@@ -0,0 +1,71 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+#include <gtest/gtest.h>
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+namespace {
+
+TEST(GPURegionAllocatorTest, Simple) {
+ GPURegionAllocator a(0, 1 << 26);
+ std::vector<void*> ptrs;
+ for (int s = 1; s < 1024; s++) {
+ void* raw = a.AllocateRaw(1, s);
+ ptrs.push_back(raw);
+ }
+ std::sort(ptrs.begin(), ptrs.end());
+  for (size_t i = 0; i < ptrs.size(); i++) {
+ if (i > 0) {
+ CHECK_NE(ptrs[i], ptrs[i - 1]); // No dups
+ }
+ a.DeallocateRaw(ptrs[i]);
+ }
+ float* t1 = a.Allocate<float>(1024);
+ double* t2 = a.Allocate<double>(1048576);
+ a.Deallocate(t1);
+ a.Deallocate(t2);
+}
+
+TEST(GPURegionAllocatorTest, CheckMemLeak) {
+ EXPECT_DEATH(
+ {
+ GPURegionAllocator a(0, 1 << 26);
+ float* t1 = a.Allocate<float>(1024);
+ if (t1) {
+ LOG(INFO) << "Not deallocating";
+ }
+ },
+ "");
+}
+
+TEST(GPURegionAllocatorTest, TracksSizes) {
+ GPURegionAllocator a(0, 1 << 26);
+  EXPECT_TRUE(a.TracksAllocationSizes());
+}
+
+TEST(GPURegionAllocatorTest, AllocatedVsRequested) {
+ GPURegionAllocator a(0, 1 << 26);
+ float* t1 = a.Allocate<float>(1);
+ EXPECT_EQ(sizeof(float), a.RequestedSize(t1));
+
+  // Minimum allocation size is 256.
+ EXPECT_EQ(256, a.AllocatedSize(t1));
+
+ a.Deallocate(t1);
+}
+
+} // namespace
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc
new file mode 100644
index 0000000000..ca86c7fa06
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc
@@ -0,0 +1,97 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
+
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+namespace gpu_stream_util {
+
+Status AssignStreams(const Graph* graph, const AssignStreamsOpts& opts,
+ std::unordered_map<int, int>* node_to_stream_id) {
+ VLOG(1) << "AssignStreams";
+ Status status;
+
+ // Sanity check arguments.
+ if (graph == nullptr)
+ status.Update(errors::InvalidArgument("Bad graph argument supplied."));
+ if (node_to_stream_id == nullptr) {
+ status.Update(
+ errors::InvalidArgument("Bad node_to_stream_id argument supplied."));
+ }
+ if ((opts.max_streams < 1) || (opts.send_stream >= opts.max_streams) ||
+ (opts.recv_stream >= opts.max_streams) ||
+ (opts.const_stream >= opts.max_streams) ||
+ (opts.compute_stream >= opts.max_streams)) {
+ status.Update(errors::InvalidArgument("Bad graph argument supplied."));
+ }
+ TF_RETURN_IF_ERROR(status);
+
+ // Topologically sort the nodes.
+ std::vector<Node*> order;
+ GetReversePostOrder(*graph, &order);
+ if (VLOG_IS_ON(2)) {
+ for (Node* n : order) {
+ const int node_id = n->id();
+ VLOG(2) << "Node " << node_id << " " << n->type_string() << " "
+ << n->name() << " " << n->in_edges().size() << " inputs";
+ for (const Edge* e : n->in_edges()) {
+ VLOG(2) << " Edge from " << e->src()->id() << " " << e->src()->name()
+ << " fanout " << e->src()->out_edges().size();
+ }
+ }
+ }
+  // We perform stream assignment assuming a large number of
+  // stream IDs and then map these down to the required number of streams
+  // using simple round-robin.
+  // Stream assignment strategy:
+  // 1. Nodes with zero inputs are always executed on a
+  // fresh stream.
+  // 2. Try to execute a node on the same stream as one of its
+  // inputs to avoid inter-stream dependencies.
+  // 3. If any input comes from a node with a large fanout, that is
+  // perhaps an indication that it is shared between parallel
+  // streams of work. We choose a new stream here so that all consumers
+  // of the tensor are likely to run in parallel.
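+  //
+  // For example, in a chain A -> B -> C where every edge has fanout 1,
+  // A gets a fresh stream id and B and C inherit it, so the whole chain
+  // runs on a single stream; a node fed by a high-fanout producer
+  // instead starts on a fresh stream id.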
+ int highest_stream_id = -1;
+ for (Node* n : order) {
+ VLOG(3) << "Inspecting node " << n->DebugString();
+ const int node_id = n->id();
+ const string& op = n->type_string();
+
+ // Determine a suitable stream to use.
+ int stream_id = highest_stream_id + 1;
+ for (const Edge* e : n->in_edges()) {
+ const int fanout = e->src()->out_edges().size();
+ if (fanout == 1) {
+ stream_id = (*node_to_stream_id)[e->src()->id()];
+ break;
+ }
+ }
+ // Override stream for specific op types.
+ if (op == "_Send") {
+ if (opts.send_stream >= 0) stream_id = opts.send_stream;
+ } else if (op == "_Recv") {
+ if (opts.recv_stream >= 0) stream_id = opts.recv_stream;
+ } else if (op == "Const") {
+ if (opts.const_stream >= 0) stream_id = opts.const_stream;
+ } else {
+ if (opts.compute_stream >= 0) stream_id = opts.compute_stream;
+ }
+
+ (*node_to_stream_id)[node_id] = stream_id % opts.max_streams;
+ highest_stream_id = std::max(stream_id, highest_stream_id);
+ }
+ VLOG(1) << "Identified " << highest_stream_id << " candidate streams for "
+ << order.size() << " nodes.";
+
+ return Status::OK();
+}
+
+} // namespace gpu_stream_util
+} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util.h b/tensorflow/core/common_runtime/gpu/gpu_stream_util.h
new file mode 100644
index 0000000000..e1c623382c
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util.h
@@ -0,0 +1,30 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_
+
+#include <unordered_map>
+
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/public/status.h"
+
+namespace tensorflow {
+namespace gpu_stream_util {
+
+struct AssignStreamsOpts {
+ int32 max_streams = 1;
+ // The following options specify a stream to use for specific op
+ // types. The value -1 allows ops to be assigned to any stream.
+ int32 send_stream = -1;
+ int32 recv_stream = -1;
+ int32 const_stream = -1;
+ int32 compute_stream = -1;
+};
+
+// Given the input graph, assigns every node in the graph a
+// stream_id that should be used.
+Status AssignStreams(const Graph* graph, const AssignStreamsOpts& opts,
+ std::unordered_map<int, int>* node_to_stream_id);
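+//
+// Typical usage (sketch; 'g' is a previously constructed Graph and the
+// stream count is illustrative only):
+//
+//   gpu_stream_util::AssignStreamsOpts opts;
+//   opts.max_streams = 4;
+//   std::unordered_map<int, int> node_to_stream_id;
+//   TF_RETURN_IF_ERROR(
+//       gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id));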
+
+} // namespace gpu_stream_util
+} // namespace tensorflow
+
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
new file mode 100644
index 0000000000..5c426caaef
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
@@ -0,0 +1,137 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/sendrecv_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+class GpuStreamUtilTest : public OpsTestBase {
+ protected:
+ void SetUp() override { RequireDefaultOps(); }
+};
+
+TEST_F(GpuStreamUtilTest, BogusOpts) {
+ GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+ Graph g(OpRegistry::Global());
+ ASSERT_OK(b.ToGraph(&g));
+ std::unordered_map<int, int> node_to_stream_id;
+ gpu_stream_util::AssignStreamsOpts opts;
+ Status status;
+ status = gpu_stream_util::AssignStreams(nullptr, opts, &node_to_stream_id);
+ EXPECT_FALSE(status.ok());
+ status = gpu_stream_util::AssignStreams(&g, opts, nullptr);
+ EXPECT_FALSE(status.ok());
+ opts.max_streams = 0;
+ status = gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id);
+ EXPECT_FALSE(status.ok());
+ opts.max_streams = 1;
+ opts.compute_stream = 5;
+ status = gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id);
+ EXPECT_FALSE(status.ok());
+}
+
+TEST_F(GpuStreamUtilTest, EmptyGraph) {
+ GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+ Graph g(OpRegistry::Global());
+ ASSERT_OK(b.ToGraph(&g));
+ std::unordered_map<int, int> node_to_stream_id;
+ gpu_stream_util::AssignStreamsOpts opts;
+ ASSERT_OK(gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id));
+ EXPECT_EQ(2, node_to_stream_id.size()); // _SOURCE and _SINK
+}
+
+TEST_F(GpuStreamUtilTest, SimpleGraphOneStream) {
+ GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+ ops::MatMul(ops::Const(Tensor(DT_FLOAT), b.opts()),
+ ops::Const(Tensor(DT_FLOAT), b.opts()), b.opts());
+ Graph g(OpRegistry::Global());
+ ASSERT_OK(b.ToGraph(&g));
+
+ std::unordered_map<int, int> node_to_stream_id;
+ gpu_stream_util::AssignStreamsOpts opts;
+ ASSERT_OK(gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id));
+
+ // There should be 5 nodes assigned.
+ EXPECT_EQ(5, node_to_stream_id.size());
+
+ // All of them should have stream 0.
+ for (const auto& it : node_to_stream_id) {
+ EXPECT_EQ(0, it.second);
+ }
+}
+
+TEST_F(GpuStreamUtilTest, SimpleGraphManyStreams) {
+ GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+ ops::MatMul(ops::Const(Tensor(DT_FLOAT), b.opts()),
+ ops::Const(Tensor(DT_FLOAT), b.opts()), b.opts());
+ Graph g(OpRegistry::Global());
+ ASSERT_OK(b.ToGraph(&g));
+
+ std::unordered_map<int, int> node_to_stream_id;
+ gpu_stream_util::AssignStreamsOpts opts;
+ opts.max_streams = 3;
+ ASSERT_OK(gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id));
+
+ // There should be 5 nodes assigned.
+ EXPECT_EQ(5, node_to_stream_id.size());
+
+ // All of them should have a stream in the range [0..max_streams).
+ for (const auto& it : node_to_stream_id) {
+ EXPECT_GE(it.second, 0);
+ EXPECT_LT(it.second, opts.max_streams);
+ }
+}
+
+TEST_F(GpuStreamUtilTest, StreamOverrides) {
+ GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+ ops::_Recv(DT_FLOAT, "input", "/cpu:0", 0, "/gpu:0",
+ b.opts().WithName("input"));
+ auto n = ops::MatMul(ops::Const(Tensor(DT_FLOAT), b.opts()),
+ ops::Const(Tensor(DT_FLOAT), b.opts()), b.opts());
+ ops::_Send(n, "output", "/gpu:0", 0, "/cpu:0", b.opts().WithName("output"));
+ Graph g(OpRegistry::Global());
+ ASSERT_OK(b.ToGraph(&g));
+
+ // Perform stream assignment using a large number of streams, but with
+ // op types constrained to specific streams.
+ std::unordered_map<int, int> node_to_stream_id;
+ gpu_stream_util::AssignStreamsOpts opts;
+ opts.max_streams = 100;
+ opts.const_stream = 90;
+ opts.send_stream = 91;
+ opts.recv_stream = 92;
+ opts.compute_stream = 93;
+ ASSERT_OK(gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id));
+
+ // There should be 7 nodes assigned.
+ EXPECT_EQ(7, node_to_stream_id.size()); // including _SOURCE and _SINK
+
+ // Nodes should be assigned to streams by op type.
+ for (const auto& it : node_to_stream_id) {
+ Node* n = g.FindNodeId(it.first);
+ const string op = n->type_string();
+ const int stream = it.second;
+ if (op == "Const") {
+ EXPECT_EQ(stream, 90);
+ } else if (op == "_Send") {
+ EXPECT_EQ(stream, 91);
+ } else if (op == "_Recv") {
+ EXPECT_EQ(stream, 92);
+ } else { // Compute.
+ EXPECT_EQ(stream, 93);
+ }
+ }
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
new file mode 100644
index 0000000000..a6a3ce01fc
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -0,0 +1,345 @@
+#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+
+//#include "base/commandlineflags.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/tensor_coding.h"
+#include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/common_runtime/gpu/dma_helper.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
+#include "tensorflow/core/util/util.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+#include "tensorflow/core/platform/stream_executor_util.h"
+
+#if defined(PLATFORM_GOOGLE)
+DEFINE_int64(brain_gpu_util_debug_string_maxlen, 128,
+ "When dumping gpu memory, prints up to this many bytes.");
+
+DECLARE_bool(record_mem_types);
+#else
+tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128;
+bool FLAGS_EXPERIMENTAL_brain_gpu_multi_stream = false;
+extern bool FLAGS_record_mem_types;
+#endif
+
+using perftools::gputools::DeviceMemoryBase;
+using perftools::gputools::DeviceMemory;
+using perftools::gputools::Stream;
+
+namespace tensorflow {
+
+namespace gpu = ::perftools::gputools;
+
+/*static*/
+void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
+ const DeviceContext* device_context,
+ TensorProto* proto, bool is_dead,
+ StatusCallback done) {
+ VLOG(1) << "SetProtoFromGPU device_context " << device_context;
+ // Tensor values need to be copied from GPU to CPU ram so that
+ // we can build the protobuf response for a RecvTensor RPC.
+ // "device context" identifies the stream where the _Send op executed.
+ CHECK(device_context);
+ gpu::Stream* stream =
+ static_cast<const GPUDeviceContext*>(device_context)->stream();
+
+ if (!DMAHelper::CanUseDMA(&tensor)) {
+ done(errors::Internal(strings::StrCat(
+ "GPU copy from non-DMA ", DataTypeString(tensor.dtype()), "tensor")));
+ return;
+ }
+ proto->set_dtype(tensor.dtype());
+ tensor.shape().AsProto(proto->mutable_tensor_shape());
+ // Prepare a Cord with the right data buf size, and DMA the
+ // data over from the GPU buffer. Note that 0-size tensors
+ // do not have a backing buffer.
+ const size_t num_bytes = is_dead ? 0 : tensor.TotalBytes();
+ if (num_bytes > 0) {
+ port::Tracing::ScopedAnnotation annotation("SetProtoFromGPU");
+ Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+ char* mb = alloc->Allocate<char>(num_bytes);
+ const char* src_ptr =
+ reinterpret_cast<const char*>(DMAHelper::base(&tensor));
+ DeviceMemoryBase gpu_src_ptr(const_cast<char*>(src_ptr), num_bytes);
+ stream->ThenMemcpy(mb, gpu_src_ptr, num_bytes);
+ // Use of tensor may outlive stack scope, so keep a ref.
+ Tensor* tensor_ref = new Tensor(tensor);
+ dev->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
+ stream, [stream, done, proto, mb, num_bytes, alloc, tensor_ref]() {
+ if (!stream->ok()) {
+ done(errors::Internal("SetProtoFromGPU: GPU Memcpy failed"));
+ // TODO(pbar) We currently have no way to recover the
+ // worker from a GPU stream in the error state. Until
+ // there is a way to reset the CUDA driver, it is
+ // preferable to crash the process and restart. Tracked
+ // under b/23717097
+ LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed";
+ return;
+ }
+ delete tensor_ref;
+ port::CopyFromArray(proto->mutable_tensor_content(), mb, num_bytes);
+ alloc->Deallocate<char>(mb);
+ done(Status::OK());
+ });
+ } else {
+ done(Status::OK());
+ }
+}
+
+typedef ProcessState::MemDesc PMD;
+
+/*static*/
+void GPUUtil::CopyViaDMA(const string& edge_name,
+ DeviceContext* send_dev_context,
+ DeviceContext* recv_dev_context, Device* src,
+ Device* dst, AllocatorAttributes src_alloc_attr,
+ AllocatorAttributes dst_alloc_attr,
+ const Tensor* input, Tensor* output,
+ StatusCallback done) {
+ port::Tracing::ScopedAnnotation annotation(edge_name);
+ VLOG(1) << "CopyViaDMA " << edge_name;
+ size_t total_bytes = input->TotalBytes();
+ // Note that 0-size tensors have no backing buffer.
+ if (total_bytes > 0) {
+ const void* src_ptr = DMAHelper::base(input);
+ void* dst_ptr = DMAHelper::base(output);
+ VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr;
+ if (FLAGS_record_mem_types) {
+ ProcessState::MemDesc smd = ProcessState::singleton()->PtrType(src_ptr);
+ ProcessState::MemDesc dmd = ProcessState::singleton()->PtrType(dst_ptr);
+ VLOG(0) << "Src " << smd.DebugString() << " Dst " << dmd.DebugString();
+ if (smd.loc == PMD::CPU && dmd.loc == PMD::GPU && (!smd.gpu_registered)) {
+ LOG(WARNING) << "CPU -> GPU no reg for " << edge_name;
+ }
+ if (dmd.loc == PMD::CPU && smd.loc == PMD::GPU && (!dmd.gpu_registered)) {
+ LOG(WARNING) << "GPU -> CPU no reg for " << edge_name;
+ }
+ }
+
+ auto src_device_type = src->attributes().device_type();
+ auto dst_device_type = dst->attributes().device_type();
+
+ bool non_cpu_src = (!src_alloc_attr.on_host() &&
+ src_device_type != DeviceType(DEVICE_CPU).type());
+ bool non_cpu_dst = (!dst_alloc_attr.on_host() &&
+ dst_device_type != DeviceType(DEVICE_CPU).type());
+ if (non_cpu_src) {
+ gpu::Stream* stream = send_dev_context->stream();
+ if (stream == nullptr) {
+ done(errors::Internal("Failed to find device stream"));
+ return;
+ }
+ auto* src_dev_info = src->tensorflow_gpu_device_info();
+ CHECK(src_dev_info);
+
+ if (non_cpu_dst) {
+ // Device to device copy
+ DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
+ stream->ThenMemcpy(
+ &gpu_dst_ptr,
+ DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes},
+ total_bytes);
+ if (dst_device_type == DeviceType(DEVICE_GPU).type()) {
+ // Use of input may outlive stack scope, so keep a ref.
+ Tensor* input_ref = new Tensor(*input);
+ src_dev_info->event_mgr->ThenExecute(
+ stream, [done, stream, input_ref]() {
+ delete input_ref;
+ if (!stream->ok()) {
+ done(errors::Internal("GPU->GPU Memcpy failed"));
+ } else {
+ done(Status::OK());
+ }
+ });
+ }
+ send_dev_context->MaintainLifetimeOnStream(input, stream);
+ } else {
+ // Device to host copy.
+ return send_dev_context->CopyDeviceTensorToCPU(input, edge_name, src,
+ output, done);
+ }
+ } else if (non_cpu_dst) {
+ // Host to Device copy.
+ // Note that this is already an async copy.
+ recv_dev_context->CopyCPUTensorToDevice(input, dst, output, done);
+ } else {
+ memcpy(dst_ptr, src_ptr, total_bytes);
+ done(Status::OK());
+ }
+ } else {
+ // buffer is empty
+ done(Status::OK());
+ }
+}
+
+void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device,
+ const DeviceContext* device_context,
+ const Tensor* gpu_tensor, Tensor* cpu_tensor,
+ StatusCallback done) {
+ VLOG(1) << "CopyGPUTensorToCPU";
+ size_t total_bytes = gpu_tensor->TotalBytes();
+ // Note that 0-size tensors have no backing buffer.
+ if (total_bytes > 0) {
+ const void* src_ptr = DMAHelper::base(gpu_tensor);
+ void* dst_ptr = DMAHelper::base(cpu_tensor);
+ CHECK(dst_ptr);
+ auto* stream = gpu_device->tensorflow_gpu_device_info()->stream;
+ if (device_context) {
+ stream = static_cast<const GPUDeviceContext*>(device_context)->stream();
+ }
+ stream->ThenMemcpy(
+ dst_ptr, DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes},
+ total_bytes);
+ stream->BlockHostUntilDone();
+ if (!stream->ok()) {
+ done(errors::Internal("CopyGPUTensorToCPU: GPU->CPU Memcpy failed"));
+ return;
+ }
+ }
+
+ done(Status::OK());
+}
+
+/* static */
+void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
+ const DeviceContext* device_context,
+ Device* gpu_device, Tensor* gpu_tensor,
+ StatusCallback done) {
+ VLOG(1) << "CopyCPUTensorToGPU";
+ CHECK(DeviceType(gpu_device->attributes().device_type()) ==
+ DeviceType(DEVICE_GPU));
+
+ auto* dev_info = gpu_device->tensorflow_gpu_device_info();
+ if (!dev_info) {
+ done(errors::Internal("Failed to find dest device GPUDeviceInfo"));
+ return;
+ }
+ if (cpu_tensor->TotalBytes() != gpu_tensor->TotalBytes()) {
+ done(errors::Internal(
+ strings::StrCat("Can't copy ", cpu_tensor->TotalBytes(),
+ " bytes of a tensor into another with ",
+ gpu_tensor->TotalBytes(), " bytes buffer.")));
+ return;
+ }
+ const int64 total_bytes = cpu_tensor->TotalBytes();
+ // Note that 0-size tensors have no backing buffer.
+ if (total_bytes > 0) {
+ const void* src_ptr = DMAHelper::base(cpu_tensor);
+ void* dst_ptr = DMAHelper::base(gpu_tensor);
+ DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
+
+ CHECK(device_context);
+ auto* stream =
+ static_cast<const GPUDeviceContext*>(device_context)->stream();
+ stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes);
+ // Use of cpu_tensor may outlive stack scope, so keep a ref.
+ Tensor* input_ref = new Tensor(*cpu_tensor);
+ dev_info->event_mgr->ThenExecute(stream, [stream, done, input_ref]() {
+ delete input_ref;
+ if (!stream->ok()) {
+ done(errors::Internal("CopyCPUTensorToGPU: GPU Memcpy failed"));
+ } else {
+ done(Status::OK());
+ }
+ });
+ } else {
+ // empty tensor case
+ done(Status::OK());
+ }
+}
+
+Status GPUUtil::Sync(Device* gpu_device) {
+ VLOG(1) << "GPUUtil::Sync";
+ auto* dev_info = gpu_device->tensorflow_gpu_device_info();
+ if (!dev_info) {
+ return errors::Internal("Failed to find dest device GPUDeviceInfo");
+ }
+ dev_info->stream->BlockHostUntilDone();
+ if (!dev_info->stream->ok()) {
+ LOG(FATAL) << "GPU sync failed";
+ }
+ return Status::OK();
+}
+
+Status GPUUtil::SyncAll(Device* gpu_device) {
+ VLOG(1) << "GPUUtil::SyncAll";
+ auto* dev_info = gpu_device->tensorflow_gpu_device_info();
+ if (!dev_info) {
+ return errors::Internal("Failed to find dest device GPUDeviceInfo");
+ }
+ if (!dev_info->stream->parent()->SynchronizeAllActivity() ||
+ !dev_info->stream->ok()) {
+ LOG(FATAL) << "GPU sync failed";
+ }
+ return Status::OK();
+}
+
+string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) {
+ string ret;
+ CHECK(tensor);
+ const int64 num_bytes = std::min<int64>(
+ FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes());
+ void* ptr = (num_bytes > 0) ? DMAHelper::base(tensor) : nullptr;
+ strings::Appendf(&ret, "%p:", ptr);
+ if (num_bytes > 0) {
+ auto* dev_info = device->tensorflow_gpu_device_info();
+ if (!dev_info) {
+ strings::StrAppend(
+ &ret, PrintMemory(reinterpret_cast<const char*>(ptr), num_bytes));
+ } else {
+ string buf;
+ buf.resize(num_bytes);
+ DeviceMemoryBase gpu_ptr(ptr, num_bytes);
+ Status s = dev_info->stream->parent()->SynchronousMemcpyD2H(
+ gpu_ptr, num_bytes, gtl::string_as_array(&buf));
+ strings::StrAppend(&ret,
+ PrintMemory(gtl::string_as_array(&buf), num_bytes));
+ }
+ }
+ return ret;
+}
+
+// TODO(pbar) Checksum is called from places without a valid device context.
+uint64 GPUUtil::Checksum(Device* gpu_device,
+ const DeviceContext* device_context,
+ const Tensor& tensor) {
+ Tensor copy(tensor.dtype(), tensor.shape());
+ Status s;
+ Notification n;
+ CopyGPUTensorToCPU(gpu_device, device_context, &tensor, &copy,
+ [&s, &n](Status status) {
+ s.Update(status);
+ n.Notify();
+ });
+ n.WaitForNotification();
+ CHECK(s.ok()) << s;
+ return Checksum(copy);
+}
+
+uint64 GPUUtil::Checksum(const Tensor& tensor) {
+ const float* fptr = reinterpret_cast<const float*>(DMAHelper::base(&tensor));
+ size_t num_bytes = tensor.TotalBytes();
+ size_t num_floats = num_bytes / sizeof(float);
+ for (size_t i = 0; i < num_floats; ++i) {
+ CHECK(!std::isnan(fptr[i])) << " i " << i;
+ }
+ // TODO(tucker): consider using crc32c instead.
+ return Hash64(reinterpret_cast<const char*>(DMAHelper::base(&tensor)),
+ tensor.TotalBytes(), 0);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h
new file mode 100644
index 0000000000..1d8c3a054d
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -0,0 +1,89 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/status.h"
+#include "tensorflow/core/common_runtime/gpu/dma_helper.h"
+#include "tensorflow/stream_executor/device_memory.h"
+
+#include "tensorflow/stream_executor/stream.h"
+
+namespace tensorflow {
+
+class RecvTensorResponse;
+class TensorProto;
+
+namespace gpu = ::perftools::gputools;
+
+class GPUUtil {
+ public:
+ // "tensor" is GPU-local. "dev" is the hosting GPU.
+ // "device_context" should be the context of the GPU "_Send" op
+ // which provides the Tensor.
+  // Sets all necessary fields of "proto" by transferring value
+  // bytes from GPU to CPU RAM. "is_dead" indicates that the
+  // tensor is dead, with an uninitialized value.
+ static void SetProtoFromGPU(const Tensor& tensor, Device* dev,
+ const DeviceContext* device_context,
+ TensorProto* proto, bool is_dead,
+ StatusCallback done);
+
+ // Copies "input" to "output" between devices accessible to the
+ // local process via some DMA-like method. "edge_name" is the name
+ // of the tensor being copied, for debugging purposes. Depending on
+ // the type of devices and memory in use, the copy may be performed
+ // synchronously or asynchronously. 'done' will be invoked only
+ // after the copy is actually complete.
+ static void CopyViaDMA(const string& edge_name,
+ DeviceContext* send_dev_context,
+ DeviceContext* recv_dev_context, Device* src,
+ Device* dst, const AllocatorAttributes src_alloc_attr,
+ const AllocatorAttributes dst_alloc_attr,
+ const Tensor* input, Tensor* output,
+ StatusCallback done);
+
+ // Copies the data in 'gpu_tensor' into 'cpu_tensor'.
+ // 'gpu_tensor''s backing memory must be on 'gpu_device' and
+ // 'cpu_tensor' must be allocated to be of the same size as
+ // 'gpu_tensor'. Synchronous: may block.
+ static void CopyGPUTensorToCPU(Device* gpu_device,
+ const DeviceContext* device_context,
+ const Tensor* gpu_tensor, Tensor* cpu_tensor,
+ StatusCallback done);
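+  //
+  // The callback-based API can be used synchronously by waiting on a
+  // Notification, e.g. (sketch of what Checksum() below does internally):
+  //
+  //   Notification n;
+  //   Status s;
+  //   GPUUtil::CopyGPUTensorToCPU(gpu_device, device_context, &gpu_tensor,
+  //                               &cpu_tensor,
+  //                               [&s, &n](Status status) {
+  //                                 s.Update(status);
+  //                                 n.Notify();
+  //                               });
+  //   n.WaitForNotification();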
+
+ // Blocks until all operations queued on the stream associated with
+ // "gpu_device" at the time of the call have completed. Returns any
+ // error pending on the stream at completion.
+ static Status Sync(Device* gpu_device);
+
+ // Blocks until all operations queued on all streams associated with the
+ // corresponding GPU device at the time of call have completed.
+ // Returns any error pending on the stream at completion.
+ static Status SyncAll(Device* gpu_device);
+
+ // For debugging purpose, given a "device" and a "tensor" allocated
+ // on the device, return a string printing each byte in the tensor
+ // (up to a limit). "device" can be either a CPU or a GPU device.
+ static string MemoryDebugString(const Device* device, Tensor* tensor);
+
+ static perftools::gputools::DeviceMemory<float> AsGPUFloat(const Tensor& t);
+
+ // Computes a checksum over the contents of "tensor", which is allocated
+ // on "gpu_device".
+ static uint64 Checksum(Device* gpu_device,
+ const DeviceContext* device_context,
+ const Tensor& tensor);
+
+ // Computes a checksum over the contents of "tensor", which is allocated
+ // in local CPU RAM.
+ static uint64 Checksum(const Tensor& tensor);
+
+ static void CopyCPUTensorToGPU(const Tensor* cpu_tensor,
+ const DeviceContext* device_context,
+ Device* gpu_device, Tensor* gpu_tensor,
+ StatusCallback done);
+};
+
+} // namespace tensorflow
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc b/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
new file mode 100644
index 0000000000..f1b1174a28
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
@@ -0,0 +1,24 @@
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/stream_executor/stream.h"
+
+namespace tensorflow {
+
+void GPUDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
+ Device* device,
+ Tensor* device_tensor,
+ StatusCallback done) const {
+ GPUUtil::CopyCPUTensorToGPU(cpu_tensor, this, device, device_tensor, done);
+}
+
+void GPUDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
+ const string& tensor_name,
+ Device* device, Tensor* cpu_tensor,
+ StatusCallback done) {
+ GPUUtil::CopyGPUTensorToCPU(device, this, device_tensor, cpu_tensor, done);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.cc b/tensorflow/core/common_runtime/gpu/pool_allocator.cc
new file mode 100644
index 0000000000..52deb7fce2
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.cc
@@ -0,0 +1,269 @@
+#include "tensorflow/core/common_runtime/gpu/pool_allocator.h"
+
+#include <errno.h>
+#include <strings.h>
+#include <sys/mman.h> // for munmap
+
+#include <map>
+
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+//#include "prodkernel/api/base/numa.h"
+
+namespace tensorflow {
+
+PoolAllocator::PoolAllocator(size_t pool_size_limit, bool auto_resize,
+ SubAllocator* allocator,
+ RoundUpInterface* size_rounder, string name)
+ : name_(name),
+ has_size_limit_(pool_size_limit > 0),
+ auto_resize_(auto_resize),
+ pool_size_limit_(pool_size_limit),
+ allocator_(allocator),
+ size_rounder_(size_rounder),
+ allocation_begun_(false) {
+ if (auto_resize) {
+ CHECK_LT(0, pool_size_limit)
+ << "size limit must be > 0 if auto_resize is true.";
+ }
+}
+
+PoolAllocator::~PoolAllocator() { Clear(); }
+
+namespace {
+// Pools contain Chunks allocated from the underlying Allocator.
+// Chunk alignment is always on kPoolAlignment boundaries. Each Chunk
+// begins with a descriptor (ChunkPrefix) that gives its size and a
+// pointer to itself. The pointer returned to the user is just past
+// the ChunkPrefix. If the user asks for a larger alignment, we will
+// increase the size of the chunk, then adjust the returned user
+// pointer and also re-write the ChunkPrefix.chunk_ptr value
+// immediately before it. This way the Chunk address and size can be
+// recovered from the returned user pointer, regardless of alignment.
+// Note that this dereferencing of the pointers means that we cannot
+// handle GPU memory, only CPU memory.
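+//
+// Worked example (assuming a 64-bit build where sizeof(ChunkPrefix) is
+// 16): for a request with alignment 64 whose chunk starts at address
+// 0x1010, user_ptr initially points at 0x1020, just past the prefix;
+// PrepareChunk advances it by 64 and rounds down to the 64-byte
+// boundary 0x1040, re-writing chunk_ptr in the 16 bytes just below
+// 0x1040 so that FindPrefix(0x1040) still recovers 0x1010.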
+struct ChunkPrefix {
+ size_t num_bytes;
+ void* chunk_ptr;
+};
+// kPoolAlignment cannot be less than the size of ChunkPrefix.
+static const int kPoolAlignment = sizeof(ChunkPrefix);
+
+void* PrepareChunk(void* chunk, size_t alignment, size_t num_bytes) {
+ ChunkPrefix* cp = reinterpret_cast<ChunkPrefix*>(chunk);
+ cp->num_bytes = num_bytes;
+ cp->chunk_ptr = chunk;
+ void* user_ptr = reinterpret_cast<void*>(cp + 1);
+ if (alignment > kPoolAlignment) {
+ // Move user_ptr forward to the first satisfying offset, and write
+ // chunk_ptr just before it.
+ size_t aligned_ptr = reinterpret_cast<size_t>(user_ptr) + alignment;
+ user_ptr = reinterpret_cast<void*>(aligned_ptr & ~(alignment - 1));
+ (reinterpret_cast<ChunkPrefix*>(user_ptr) - 1)->chunk_ptr = chunk;
+ }
+ // Safety check that user_ptr is always past the ChunkPrefix.
+ CHECK_GE(user_ptr, reinterpret_cast<ChunkPrefix*>(chunk) + 1);
+ return user_ptr;
+}
+
+ChunkPrefix* FindPrefix(void* user_ptr) {
+ ChunkPrefix* cp = reinterpret_cast<ChunkPrefix*>(user_ptr) - 1;
+ return reinterpret_cast<ChunkPrefix*>(cp->chunk_ptr);
+}
+} // namespace
+
+void* PoolAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
+ if (!allocation_begun_) allocation_begun_ = true;
+ if (num_bytes == 0) return nullptr;
+
+ // If alignment is larger than kPoolAlignment, increase num_bytes so that we
+ // are guaranteed to be able to return an aligned ptr by advancing user_ptr
+ // without overrunning the end of the chunk.
+ if (alignment > kPoolAlignment) {
+ num_bytes += alignment;
+ }
+ num_bytes += sizeof(ChunkPrefix);
+ num_bytes = size_rounder_->RoundUp(num_bytes);
+ PtrRecord* pr = nullptr;
+ if (has_size_limit_) {
+ {
+ mutex_lock lock(mutex_);
+ auto iter = pool_.find(num_bytes);
+ if (iter == pool_.end()) {
+ allocated_count_++;
+ // Deliberately fall out of lock scope before
+ // calling the allocator. No further modification
+ // to the pool will be performed.
+ } else {
+ get_from_pool_count_++;
+ pr = iter->second;
+ RemoveFromList(pr);
+ pool_.erase(iter);
+        // Fall out of lock scope and do the rest without the lock held.
+ }
+ }
+ }
+ if (pr != nullptr) {
+ void* r = pr->ptr;
+ delete pr;
+ return PrepareChunk(r, alignment, num_bytes);
+ } else {
+ void* ptr = allocator_->Alloc(kPoolAlignment, num_bytes);
+ for (auto v : alloc_visitors_) {
+ v(ptr, num_bytes);
+ }
+ return PrepareChunk(ptr, alignment, num_bytes);
+ }
+}
+
+void PoolAllocator::DeallocateRaw(void* ptr) {
+ if (ptr == nullptr) return;
+ ChunkPrefix* cp = FindPrefix(ptr);
+ CHECK_LE((void*)cp, (void*)ptr);
+ if (!has_size_limit_ && !auto_resize_) {
+ for (auto v : free_visitors_) {
+ v(cp, cp->num_bytes);
+ }
+ allocator_->Free(cp, cp->num_bytes);
+ } else {
+ mutex_lock lock(mutex_);
+ ++put_count_;
+ while (pool_.size() >= pool_size_limit_) {
+ EvictOne();
+ }
+ PtrRecord* pr = new PtrRecord;
+ pr->num_bytes = cp->num_bytes;
+ pr->ptr = cp;
+ AddToList(pr);
+ pool_.insert(std::make_pair(cp->num_bytes, pr));
+ }
+}
+
+void PoolAllocator::Clear() {
+ if (has_size_limit_) {
+ mutex_lock lock(mutex_);
+ for (auto iter : pool_) {
+ PtrRecord* pr = iter.second;
+ for (auto v : free_visitors_) {
+ v(pr->ptr, pr->num_bytes);
+ }
+ allocator_->Free(pr->ptr, pr->num_bytes);
+ delete pr;
+ }
+ pool_.clear();
+ get_from_pool_count_ = 0;
+ put_count_ = 0;
+ allocated_count_ = 0;
+ evicted_count_ = 0;
+ lru_head_ = nullptr;
+ lru_tail_ = nullptr;
+ }
+}
+
+void PoolAllocator::RemoveFromList(PtrRecord* pr) {
+ if (pr->prev == nullptr) {
+ DCHECK_EQ(lru_head_, pr);
+ lru_head_ = nullptr;
+ } else {
+ pr->prev->next = pr->next;
+ }
+ if (pr->next == nullptr) {
+ DCHECK_EQ(lru_tail_, pr);
+ lru_tail_ = pr->prev;
+ } else {
+ pr->next->prev = pr->prev;
+ if (lru_head_ == nullptr) {
+ lru_head_ = pr->next;
+ }
+ }
+}
+
+void PoolAllocator::AddToList(PtrRecord* pr) {
+ pr->prev = nullptr;
+ if (lru_head_ == nullptr) {
+ CHECK(lru_tail_ == nullptr);
+ lru_tail_ = pr;
+ pr->next = nullptr;
+ } else {
+ pr->next = lru_head_;
+ pr->next->prev = pr;
+ }
+ lru_head_ = pr;
+}
+
+void PoolAllocator::EvictOne() {
+ DCHECK(lru_tail_ != nullptr);
+ PtrRecord* prec = lru_tail_;
+ RemoveFromList(prec);
+ auto iter = pool_.find(prec->num_bytes);
+ while (iter->second != prec) {
+ ++iter;
+ DCHECK(iter != pool_.end());
+ }
+ pool_.erase(iter);
+ for (auto v : free_visitors_) {
+ v(prec->ptr, prec->num_bytes);
+ }
+ allocator_->Free(prec->ptr, prec->num_bytes);
+ delete prec;
+ ++evicted_count_;
+ // Auto-resizing, and warning messages.
+ static const double kTolerable = 2e-3;
+ static const int kCheckInterval = 1000;
+ static const double kIncreaseFactor = 1.1;
+ static const int kMinPoolSize = 100;
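+  // For example (cf. the AutoResize test): a pool created with
+  // pool_size_limit 2 and auto_resize true that evicts on nearly every
+  // Put reaches the 1000-eviction checkpoint with eviction_rate and
+  // alloc_rate far above kTolerable, so pool_size_limit_ is bumped
+  // straight to kMinPoolSize (100), since 2 < kMinPoolSize.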
+ if (0 == evicted_count_ % kCheckInterval) {
+ const double eviction_rate =
+ evicted_count_ / static_cast<double>(put_count_);
+ const int64 alloc_request_count = allocated_count_ + get_from_pool_count_;
+ const double alloc_rate =
+ allocated_count_ / static_cast<double>(alloc_request_count);
+ static int log_counter = 0;
+ // (counter increment not thread safe but it's just for logging, so we
+ // don't care).
+ bool should_log = ((log_counter++ % 10) == 0);
+ if (should_log) {
+ LOG(WARNING) << "PoolAllocator: After " << alloc_request_count
+ << " get requests, put_count=" << put_count_
+ << " evicted_count=" << evicted_count_
+ << " eviction_rate=" << eviction_rate
+ << " and unsatisfied allocation rate=" << alloc_rate;
+ }
+ if (auto_resize_ && (eviction_rate > kTolerable) &&
+ (alloc_rate > kTolerable)) {
+ size_t new_size_limit = (pool_size_limit_ < kMinPoolSize)
+ ? kMinPoolSize
+ : (kIncreaseFactor * pool_size_limit_);
+ if (should_log) {
+ LOG(INFO) << "Raising pool_size_limit_ from " << pool_size_limit_
+ << " to " << new_size_limit;
+ }
+ pool_size_limit_ = new_size_limit;
+ // Reset all the counters so that ratios are relative to new sizes
+ // at next test interval.
+ put_count_ = 0;
+ allocated_count_ = 0;
+ evicted_count_ = 0;
+ get_from_pool_count_ = 0;
+ }
+ }
+}
+
+void PoolAllocator::AddAllocVisitor(Visitor visitor) {
+ mutex_lock lock(mutex_);
+ CHECK(!allocation_begun_)
+ << "AddAllocVisitor may not be called after pool allocation "
+ << "has begun.";
+ alloc_visitors_.push_back(visitor);
+}
+
+void PoolAllocator::AddFreeVisitor(Visitor visitor) {
+ mutex_lock lock(mutex_);
+ CHECK(!allocation_begun_)
+ << "AddFreeVisitor may not be called after pool allocation "
+ << "has begun.";
+ free_visitors_.push_back(visitor);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h
new file mode 100644
index 0000000000..d10aabe88a
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h
@@ -0,0 +1,202 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_
+
+// Simple LRU pool allocators for various flavors of CPU RAM that
+// implement the VisitableAllocator interface. GPU memory is managed
+// by GPURegionAllocator.
+
+#include <atomic>
+#include <map>
+#include <memory>
+#include "tensorflow/core/lib/core/bits.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace tensorflow {
+
+// Interface of an object that does the underlying alloc/free of memory.
+class SubAllocator {
+ public:
+ virtual ~SubAllocator() {}
+ virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
+ virtual void Free(void* ptr, size_t num_bytes) = 0;
+};
+
+// Interface of an object that rounds up integers.
+class RoundUpInterface {
+ public:
+ virtual ~RoundUpInterface() {}
+ virtual size_t RoundUp(size_t num_bytes) = 0;
+};
+
+// Size-limited pool of memory buffers obtained from a SubAllocator
+// instance. Pool eviction policy is LRU.
+class PoolAllocator : public VisitableAllocator {
+ public:
+ // "pool_size_limit" is the maximum number of returned, re-usable
+ // memory buffers to keep in the pool. If pool_size_limit == 0, the
+ // pool is effectively a thin wrapper around the allocator.
+ // If "auto_resize" is true, then the pool_size_limit will gradually
+ // be raised so that deallocations happen very rarely, if at all.
+ // Transitory start-up objects may deallocate, but the long-term
+ // working-set should not. Auto-resizing can raise pool_size_limit
+ // but will never lower it.
+ // "allocator" is the object that performs the underlying memory
+ // malloc/free operations. This object takes ownership of allocator.
+ PoolAllocator(size_t pool_size_limit, bool auto_resize,
+ SubAllocator* allocator, RoundUpInterface* size_rounder,
+ string name);
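+  //
+  // For example, process_state.cc constructs its CPU pool roughly as:
+  //
+  //   PoolAllocator* pool = new PoolAllocator(
+  //       100 /*pool_size_limit*/, true /*auto_resize*/,
+  //       new BasicCPUAllocator(), new NoopRounder, "cpu_pool");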
+ ~PoolAllocator() override;
+
+ string Name() override { return name_; }
+
+ void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+
+ void DeallocateRaw(void* ptr) override;
+
+ // REQUIRES: The following functions may only be called prior
+ // to the first Allocate*() call. Once allocation has begun, it is
+ // illegal to register another visitor.
+
+ void AddAllocVisitor(Visitor visitor) override;
+
+ void AddFreeVisitor(Visitor visitor) override;
+
+ // Allocate an unused memory region of size "num_bytes". Fetch from
+ // the pool if available, otherwise call allocator_.
+ void* Get(size_t num_bytes);
+
+ // Return a no-longer needed memory region to the pool. It is an error
+ // to deference "ptr" after this call. If the pool is full, the least
+ // recently used region will be deallocated.
+ void Put(void* ptr, size_t num_bytes);
+
+ // Reset the pool to empty.
+ void Clear();
+
+ // The following accessors permit monitoring the effectiveness of
+ // the pool at avoiding repeated malloc/frees on the underlying
+ // allocator. Read locks are not taken on the theory that value
+ // consistency with other threads is not important.
+
+ // Number of Get() requests satisfied from pool.
+ int64 get_from_pool_count() const NO_THREAD_SAFETY_ANALYSIS {
+ return get_from_pool_count_;
+ }
+ // Number of Put() requests.
+ int64 put_count() const NO_THREAD_SAFETY_ANALYSIS { return put_count_; }
+ // Number of Get() requests requiring a fresh allocation.
+ int64 allocated_count() const NO_THREAD_SAFETY_ANALYSIS {
+ return allocated_count_;
+ }
+ // Number of pool evictions.
+ int64 evicted_count() const NO_THREAD_SAFETY_ANALYSIS {
+ return evicted_count_;
+ }
+ // Current size limit.
+ size_t size_limit() const NO_THREAD_SAFETY_ANALYSIS {
+ return pool_size_limit_;
+ }
+
+ private:
+ struct PtrRecord {
+ void* ptr;
+ size_t num_bytes;
+ PtrRecord* prev;
+ PtrRecord* next;
+ };
+
+ // Remove "pr" from the double-linked LRU list.
+ void RemoveFromList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Add "pr" to the head of the double-linked LRU list.
+ void AddToList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Delete the least recently used record.
+ void EvictOne() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ const string name_;
+ const bool has_size_limit_;
+ const bool auto_resize_;
+ size_t pool_size_limit_;
+ std::unique_ptr<SubAllocator> allocator_;
+ std::unique_ptr<RoundUpInterface> size_rounder_;
+ mutex mutex_;
+ std::multimap<const size_t, PtrRecord*> pool_ GUARDED_BY(mutex_);
+ PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr;
+ PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr;
+ int64 get_from_pool_count_ GUARDED_BY(mutex_) = 0;
+ int64 put_count_ GUARDED_BY(mutex_) = 0;
+ int64 allocated_count_ GUARDED_BY(mutex_) = 0;
+ int64 evicted_count_ GUARDED_BY(mutex_) = 0;
+ // Write access to these is guarded by mutex_, but not read
+ // access. They may only be modified prior to the first
+ // allocation. Later attempts to modify will fail.
+ std::vector<Visitor> alloc_visitors_;
+ std::vector<Visitor> free_visitors_;
+ std::atomic<bool> allocation_begun_;
+};
+
+// Do-nothing rounder. Passes through sizes unchanged.
+class NoopRounder : public RoundUpInterface {
+ public:
+ size_t RoundUp(size_t num_bytes) override { return num_bytes; }
+};
+
+// Power of 2 rounder: rounds up to nearest power of 2 size.
+class Pow2Rounder : public RoundUpInterface {
+ public:
+ size_t RoundUp(size_t num_bytes) override {
+ return 1uLL << Log2Ceiling64(num_bytes);
+ }
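+  //
+  // e.g. RoundUp(9) == 16 and RoundUp(65535) == 65536 (see the
+  // Pow2Rounder case in pool_allocator_test.cc).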
+};
+
+class BasicCPUAllocator : public SubAllocator {
+ public:
+ ~BasicCPUAllocator() override {}
+
+ void* Alloc(size_t alignment, size_t num_bytes) override {
+ return port::aligned_malloc(num_bytes, alignment);
+ }
+ void Free(void* ptr, size_t num_bytes) override { free(ptr); }
+};
+
+// Allocator for pinned CPU RAM that is made known to CUDA for the
+// purpose of efficient DMA with a GPU.
+class CUDAHostAllocator : public SubAllocator {
+ public:
+ // Note: stream_exec cannot be null.
+ explicit CUDAHostAllocator(perftools::gputools::StreamExecutor* stream_exec)
+ : stream_exec_(stream_exec) {
+ CHECK(stream_exec_ != nullptr);
+ }
+ ~CUDAHostAllocator() override {}
+
+ void* Alloc(size_t alignment, size_t num_bytes) override {
+ void* ptr = nullptr;
+ if (num_bytes > 0) {
+ ptr = stream_exec_->HostMemoryAllocate(num_bytes);
+ if (ptr == nullptr) {
+ LOG(FATAL) << "could not allocate pinned host memory of size: "
+ << num_bytes;
+ }
+ }
+ return ptr;
+ }
+
+ void Free(void* ptr, size_t num_bytes) override {
+ if (ptr != nullptr) {
+ stream_exec_->HostMemoryDeallocate(ptr);
+ }
+ }
+
+ private:
+ perftools::gputools::StreamExecutor* stream_exec_; // not owned, non-null
+
+ TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
+};
+
+} // namespace tensorflow
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
new file mode 100644
index 0000000000..ca409b2b4c
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
@@ -0,0 +1,203 @@
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/gpu/pool_allocator.h"
+
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/platform.h"
+#include <gtest/gtest.h>
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+namespace {
+
+TEST(PoolAllocatorTest, ZeroSizeBuffers) {
+ gpu::Platform* platform =
+ gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+ PoolAllocator pool(
+ 2 /*pool_size_limit*/, false /*auto_resize*/,
+ new CUDAHostAllocator(
+ platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+ .ValueOrDie()),
+ new NoopRounder, "pool");
+
+ EXPECT_EQ(nullptr, pool.AllocateRaw(4 /*alignment*/, 0 /*num_bytes*/));
+ pool.DeallocateRaw(nullptr); // Should not crash.
+ EXPECT_EQ(0, pool.get_from_pool_count());
+ EXPECT_EQ(0, pool.put_count());
+ EXPECT_EQ(0, pool.allocated_count());
+ EXPECT_EQ(0, pool.evicted_count());
+}
+
+TEST(PoolAllocatorTest, ZeroSizePool) {
+ gpu::Platform* platform =
+ gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+ PoolAllocator pool(
+ 0 /*pool_size_limit*/, false /*auto_resize*/,
+ new CUDAHostAllocator(
+ platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+ .ValueOrDie()),
+ new NoopRounder, "pool");
+
+ EXPECT_EQ(0, pool.get_from_pool_count());
+ EXPECT_EQ(0, pool.put_count());
+ EXPECT_EQ(0, pool.allocated_count());
+ EXPECT_EQ(0, pool.evicted_count());
+
+ // All allocations should bypass the pool and return valid pointers.
+ for (int i = 0; i < 3; ++i) {
+ void* p0 = pool.AllocateRaw(4, 0);
+ void* p4 = pool.AllocateRaw(4, 4);
+ void* p12 = pool.AllocateRaw(4, 12);
+ EXPECT_EQ(nullptr, p0);
+ EXPECT_NE(nullptr, p4);
+ EXPECT_NE(nullptr, p12);
+ pool.DeallocateRaw(p0);
+ pool.DeallocateRaw(p4);
+ pool.DeallocateRaw(p12);
+ }
+ EXPECT_EQ(0, pool.get_from_pool_count());
+ EXPECT_EQ(0, pool.put_count());
+ EXPECT_EQ(0, pool.allocated_count());
+ EXPECT_EQ(0, pool.evicted_count());
+}
+
+TEST(PoolAllocatorTest, Alignment) {
+ gpu::Platform* platform =
+ gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+ PoolAllocator pool(
+ 0 /*pool_size_limit*/, false /*auto_resize*/,
+ new CUDAHostAllocator(
+ platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+ .ValueOrDie()),
+ new NoopRounder, "pool");
+ for (int i = 0; i < 16; ++i) {
+ size_t alignment = 1 << i;
+ void* p = pool.AllocateRaw(alignment, 111);
+ EXPECT_TRUE(p != nullptr);
+ EXPECT_EQ(0, reinterpret_cast<int64>(p) & (alignment - 1))
+ << "ptr: " << p << " alignment " << alignment;
+ // Intentionally don't deallocate, to test that destruction of
+ // the PoolAllocator frees all pending memory.
+ }
+}
+
+TEST(PoolAllocatorTest, AutoResize) {
+ PoolAllocator pool(2 /*pool_size_limit*/, true /*auto_resize*/,
+ new BasicCPUAllocator, new NoopRounder, "pool");
+
+ // Alloc/dealloc 10 sizes just a few times, confirming pool size
+ // stays at 2.
+ for (int i = 0; i < 10; ++i) {
+ void* p = pool.AllocateRaw(4, 64 << i);
+ pool.DeallocateRaw(p);
+ }
+ EXPECT_EQ(0, pool.get_from_pool_count());
+ EXPECT_EQ(10, pool.allocated_count());
+ EXPECT_EQ(10, pool.put_count());
+ EXPECT_EQ(8, pool.evicted_count());
+ EXPECT_EQ(2, pool.size_limit());
+
+  // Then repeat 1200 more times; the pool size limit should jump to 100.
+ for (int j = 0; j < 120; ++j) {
+ for (int i = 0; i < 10; ++i) {
+ void* p = pool.AllocateRaw(4, 64 << i);
+ pool.DeallocateRaw(p);
+ }
+ }
+ EXPECT_EQ(100, pool.size_limit());
+}
+
+TEST(PoolAllocatorTest, CudaHostAllocator) {
+ gpu::Platform* platform =
+ gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+ PoolAllocator pool(
+ 2 /*pool_size_limit*/, false /*auto_resize*/,
+ new CUDAHostAllocator(
+ platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+ .ValueOrDie()),
+ new NoopRounder, "pool");
+
+ // Repeatedly Get a 16-byte value, confirming that there's only
+ // one real allocation.
+ void* p1_16 = pool.AllocateRaw(4, 16);
+ EXPECT_EQ(0, pool.get_from_pool_count());
+ EXPECT_EQ(1, pool.allocated_count());
+ EXPECT_NE(nullptr, p1_16);
+ pool.DeallocateRaw(p1_16);
+ // Pool contents {16}
+ EXPECT_EQ(1, pool.put_count());
+ void* p2_16 = pool.AllocateRaw(4, 16); // Get it again.
+ EXPECT_EQ(1, pool.get_from_pool_count());
+ EXPECT_EQ(1, pool.allocated_count());
+ EXPECT_EQ(p1_16, p2_16); // Same pointer value
+ pool.DeallocateRaw(p2_16); // Put it back.
+ // Pool contents {16}
+ EXPECT_EQ(2, pool.put_count());
+
+ // Get two more values of different sizes.
+ void* p3_4 = pool.AllocateRaw(4, 4);
+ EXPECT_EQ(2, pool.allocated_count());
+ EXPECT_NE(p1_16, p3_4); // Different pointer value
+ EXPECT_NE(nullptr, p3_4);
+ pool.DeallocateRaw(p3_4); // Put it back. Pool is now full.
+ // Pool contents {4, 16}
+ EXPECT_EQ(3, pool.put_count());
+ void* p4_2 = pool.AllocateRaw(4, 2); // Get a third size buffer.
+ EXPECT_NE(nullptr, p4_2);
+ EXPECT_EQ(0, pool.evicted_count());
+
+ // The pool is full: when we put back p4_2, the 16-byte buffer
+ // should be evicted since it was least recently inserted.
+ pool.DeallocateRaw(p4_2);
+ // Pool contents {2, 4}
+ EXPECT_EQ(4, pool.put_count());
+ EXPECT_EQ(1, pool.evicted_count());
+
+ // Re-getting and putting size 2 or 4 should not alter pool size or
+ // num-evicted.
+ void* p5_4 = pool.AllocateRaw(4, 4);
+ EXPECT_NE(nullptr, p5_4);
+ pool.DeallocateRaw(p5_4);
+ void* p6_2 = pool.AllocateRaw(4, 2);
+ EXPECT_NE(nullptr, p6_2);
+ pool.DeallocateRaw(p6_2);
+ EXPECT_EQ(3, pool.get_from_pool_count());
+ EXPECT_EQ(6, pool.put_count());
+ EXPECT_EQ(3, pool.allocated_count());
+ EXPECT_EQ(1, pool.evicted_count());
+
+ pool.Clear();
+ EXPECT_EQ(0, pool.get_from_pool_count());
+ EXPECT_EQ(0, pool.put_count());
+ EXPECT_EQ(0, pool.allocated_count());
+ EXPECT_EQ(0, pool.evicted_count());
+}
+
+TEST(PoolAllocatorTest, Pow2Rounder) {
+ Pow2Rounder rounder;
+ EXPECT_EQ(1, rounder.RoundUp(1));
+ EXPECT_EQ(2, rounder.RoundUp(2));
+ EXPECT_EQ(16, rounder.RoundUp(9));
+ EXPECT_EQ(16, rounder.RoundUp(16));
+ EXPECT_EQ(65536, rounder.RoundUp(41234));
+ EXPECT_EQ(65536, rounder.RoundUp(65535));
+ EXPECT_EQ(65536, rounder.RoundUp(65536));
+}
+
+TEST(PoolAllocatorTest, Name) {
+ gpu::Platform* platform =
+ gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+ PoolAllocator pool(
+ 2 /*pool_size_limit*/, false /*auto_resize*/,
+ new CUDAHostAllocator(
+ platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+ .ValueOrDie()),
+ new NoopRounder, "pool");
+ EXPECT_EQ("pool", pool.Name());
+}
+
+} // namespace
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
new file mode 100644
index 0000000000..70ac6130c2
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -0,0 +1,220 @@
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_region_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/pool_allocator.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+
+#if defined(PLATFORM_GOOGLE)
+DEFINE_bool(record_mem_types, false,
+ "If true, record attributes of memory allocations and "
+ "dyanmically check for appropriate use of registered memory."
+ "Should only be true for debugging or diagnosis of "
+ "performance issues.");
+DEFINE_bool(brain_mem_reg_cuda_dma, true,
+ "If true, register CPU RAM used to copy to/from GPU RAM "
+ "with the CUDA driver.");
+DEFINE_bool(brain_gpu_use_bfc_allocator, false,
+ "If true, uses the Best-Fit GPU allocator.");
+DEFINE_bool(brain_gpu_region_allocator_debug, false,
+ "If true, checks for memory overwrites by writing "
+ "distinctive patterns on both ends of allocated memory.");
+DEFINE_bool(brain_gpu_region_allocator_reset_to_nan, false,
+ "If true, initializes all new Malloc buffers to NaN, "
+ "and resets the buffer to NaN upon Free.");
+
+#else
+bool FLAGS_record_mem_types = false;
+bool FLAGS_brain_mem_reg_cuda_dma = true;
+bool FLAGS_brain_gpu_region_allocator_debug = false;
+bool FLAGS_brain_gpu_region_allocator_reset_to_nan = false;
+bool FLAGS_brain_gpu_use_bfc_allocator = false;
+#endif
+
+namespace gpu = ::perftools::gputools;
+
+namespace tensorflow {
+
+ProcessState* ProcessState::instance_ = nullptr;
+
+/*static*/ ProcessState* ProcessState::singleton() {
+ if (instance_ == nullptr) {
+ instance_ = new ProcessState;
+ }
+
+ return instance_;
+}
+
+ProcessState::ProcessState() : gpu_count_(0) {
+ CHECK(instance_ == nullptr);
+ instance_ = this;
+}
+
+ProcessState::~ProcessState() {
+ for (auto p : gpu_allocators_) {
+ delete p;
+ }
+ instance_ = nullptr;
+}
+
+string ProcessState::MemDesc::DebugString() {
+ return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, ", dma: ",
+ gpu_registered, ", nic: ", nic_registered);
+}
+
+ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
+ if (FLAGS_record_mem_types) {
+ auto iter = mem_desc_map_.find(ptr);
+ if (iter != mem_desc_map_.end()) {
+ return iter->second;
+ }
+ }
+ return MemDesc();
+}
+
+void ProcessState::SetGPUCount(int c) {
+ CHECK(gpu_count_ == 0 || gpu_count_ == c)
+ << "Cannot call SetGPUCount with a non-zero value "
+ << "not equal to prior set value.";
+ gpu_count_ = c;
+}
+
+int ProcessState::GPUCount() const { return gpu_count_; }
+
+Allocator* ProcessState::GetGPUAllocator(int gpu_id, size_t total_bytes) {
+#if GOOGLE_CUDA
+ mutex_lock lock(mu_);
+ gpu::Platform* gpu_platform = GPUMachineManager();
+
+ // Verify that gpu_id is legitimate.
+ CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount())
+ << "gpu_id is outside discovered device range";
+
+ if (gpu_id >= static_cast<int64>(gpu_allocators_.size())) {
+ gpu_allocators_.resize(gpu_id + 1);
+ if (FLAGS_record_mem_types) gpu_al_.resize(gpu_id + 1);
+ }
+
+ if (gpu_allocators_[gpu_id] == nullptr) {
+ VisitableAllocator* gpu_allocator;
+
+ if (FLAGS_brain_gpu_use_bfc_allocator) {
+ gpu_allocator = new GPUBFCAllocator(gpu_id, total_bytes);
+ } else {
+ gpu_allocator = new GPURegionAllocator(gpu_id, total_bytes);
+ }
+
+ if (FLAGS_brain_gpu_region_allocator_debug) {
+ gpu_allocator = new GPUDebugAllocator(gpu_allocator, gpu_id);
+ }
+ if (FLAGS_brain_gpu_region_allocator_reset_to_nan) {
+ gpu_allocator = new GPUNanResetAllocator(gpu_allocator, gpu_id);
+ }
+
+ gpu_allocators_[gpu_id] = gpu_allocator;
+
+ // If there are any pending AllocVisitors for this bus, add
+ // them now.
+ gpu::StreamExecutor* se =
+ gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+ int bus_id = se->GetDeviceDescription().numa_node();
+ if (bus_id < static_cast<int64>(gpu_visitors_.size())) {
+ for (auto v : gpu_visitors_[bus_id]) {
+ gpu_allocators_[gpu_id]->AddAllocVisitor(v);
+ }
+ }
+ if (FLAGS_record_mem_types) {
+ MemDesc md;
+ md.loc = MemDesc::GPU;
+ md.dev_index = gpu_id;
+ md.gpu_registered = false;
+ md.nic_registered = true;
+ if (static_cast<int64>(gpu_al_.size()) <= gpu_id)
+ gpu_al_.resize(gpu_id + 1);
+ gpu_al_[gpu_id] = new internal::RecordingAllocator(
+ &mem_desc_map_, gpu_allocators_[gpu_id], md, &mu_);
+ }
+ }
+ if (FLAGS_record_mem_types) return gpu_al_[gpu_id];
+ return gpu_allocators_[gpu_id];
+#else
+ LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda.";
+ return nullptr;
+#endif // GOOGLE_CUDA
+}
+
+Allocator* ProcessState::GetCPUAllocator(int numa_node) {
+ // Although we're temporarily ignoring numa_node, check for legality.
+ CHECK_GE(numa_node, 0);
+ // TODO(tucker): actually maintain separate CPUAllocators for
+ // different numa_nodes. For now, just one.
+ numa_node = 0;
+ mutex_lock lock(mu_);
+ while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
+ cpu_allocators_.push_back(new PoolAllocator(
+ 100 /*pool_size_limit*/, true /*auto_resize*/, new BasicCPUAllocator(),
+ new NoopRounder, "cpu_pool"));
+ }
+ return cpu_allocators_[0];
+}
+
+Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
+ if (gpu_count_ == 0 || !FLAGS_brain_mem_reg_cuda_dma) {
+ return GetCPUAllocator(numa_node);
+ }
+ // Although we're temporarily ignoring numa_node, check for legality.
+ CHECK_GE(numa_node, 0);
+ // TODO(tucker): actually maintain separate CPUAllocators for
+ // different numa_nodes. For now, just one.
+ numa_node = 0;
+ mutex_lock lock(mu_);
+ while (static_cast<int>(cuda_host_allocators_.size()) <= numa_node) {
+      // CUDAHost allocation is the same across all GPUs, so just get
+      // the executor for the first device.
+ gpu::Platform* gpu_platform = GPUMachineManager();
+ gpu::StreamExecutor* se = gpu_platform->ExecutorForDevice(0).ValueOrDie();
+ CHECK(se);
+ cuda_host_allocators_.push_back(new PoolAllocator(
+ 100 /*pool_size_limit*/, true /*auto_resize*/,
+ new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host"));
+ if (FLAGS_record_mem_types) {
+ MemDesc md;
+ md.loc = MemDesc::CPU;
+ md.dev_index = 0;
+ md.gpu_registered = true;
+ md.nic_registered = false;
+ cuda_al_.push_back(new internal::RecordingAllocator(
+ &mem_desc_map_, cuda_host_allocators_.back(), md, &mu_));
+ }
+ }
+ if (FLAGS_record_mem_types) return cuda_al_[0];
+ return cuda_host_allocators_[0];
+}
+
+void ProcessState::AddGPUAllocVisitor(int bus_id, AllocVisitor visitor) {
+#if GOOGLE_CUDA
+ mutex_lock lock(mu_);
+ gpu::Platform* gpu_platform = GPUMachineManager();
+ for (int gpu_id = 0; gpu_id < static_cast<int64>(gpu_allocators_.size());
+ ++gpu_id) {
+ gpu::StreamExecutor* se =
+ gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+ if (gpu_allocators_[gpu_id] &&
+ se->GetDeviceDescription().numa_node() == bus_id) {
+ gpu_allocators_[gpu_id]->AddAllocVisitor(visitor);
+ }
+ }
+ while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
+ gpu_visitors_.push_back(std::vector<AllocVisitor>());
+ }
+ gpu_visitors_[bus_id].push_back(visitor);
+#endif // GOOGLE_CUDA
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/process_state.h b/tensorflow/core/common_runtime/gpu/process_state.h
new file mode 100644
index 0000000000..527d12c10d
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/process_state.h
@@ -0,0 +1,140 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_PROCESS_STATE_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_PROCESS_STATE_H_
+
+#include <functional>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+
+namespace tensorflow {
+
+class Allocator;
+class VisitableAllocator;
+class PoolAllocator;
+
+// Singleton that manages per-process state, e.g. allocation
+// of shared resources.
+class ProcessState {
+ public:
+ static ProcessState* singleton();
+
+ // Descriptor for memory allocation attributes, used by optional
+ // runtime correctness analysis logic.
+ struct MemDesc {
+ enum MemLoc { CPU, GPU };
+ MemLoc loc;
+ int dev_index;
+ bool gpu_registered;
+ bool nic_registered;
+ MemDesc()
+ : loc(CPU),
+ dev_index(0),
+ gpu_registered(false),
+ nic_registered(false) {}
+ string DebugString();
+ };
+
+ // Records the number of GPUs available in the local process.
+  // It is a fatal error to call this with a value that differs from
+  // a non-zero value set by a prior call.
+ void SetGPUCount(int c);
+
+  // Returns the number of GPUs available in the local process, as set
+  // by SetGPUCount(). Returns 0 if SetGPUCount has not been called.
+ int GPUCount() const;
+
+  // Returns what we know about the memory at ptr.  If nothing is
+  // known, returns a default MemDesc: CPU 0 with no other attributes.
+ MemDesc PtrType(const void* ptr);
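+  //
+  // Example (sketch): inspecting a pointer's known attributes, assuming
+  // FLAGS_record_mem_types is enabled so attributes are being tracked:
+  //
+  //   ProcessState::MemDesc md = ProcessState::singleton()->PtrType(p);
+  //   VLOG(1) << md.DebugString();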
+
+ // Returns the one CPUAllocator used for the given numa_node.
+  // TEMPORARY: ignores numa_node.
+ Allocator* GetCPUAllocator(int numa_node);
+
+ // Returns the one GPU allocator used for the indexed GPU.
+ // Note that this is a system GPU index, not (necessarily) a brain
+ // device index.
+ //
+ // 'total_bytes' is the total number of bytes that should be made
+ // available to the allocator. The first call to this function for
+ // a given gpu_id creates the allocator, so only the total_bytes
+ // used on that first call is used.
+ //
+ // REQUIRES: gpu_id must be a valid ordinal for a GPU available in the
+ // current system environment. Otherwise returns nullptr.
+ Allocator* GetGPUAllocator(int gpu_id, size_t total_bytes);
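+  //
+  // Example (sketch): requesting the allocator for GPU 0 with an
+  // illustrative 1GB limit; later calls for the same gpu_id return
+  // the same allocator regardless of the byte count passed:
+  //
+  //   Allocator* a = ProcessState::singleton()->GetGPUAllocator(0, 1 << 30);
+  //   void* buf = a->AllocateRaw(64 /*alignment*/, 4096 /*num_bytes*/);
+  //   ...
+  //   a->DeallocateRaw(buf);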
+
+ Allocator* GetCUDAHostAllocator(int numa_node);
+
+ // Registers a function to be called once on every new Region
+ // allocated by every GPURegionAllocator proximate to the specified
+ // bus. The AllocVisitor is provided with a memory pointer and the
+ // size of the area it identifies. The pointer is not guaranteed to
+ // be valid after the call terminates. The intention is for this
+ // interface to be used for network device memory registration.
+ // "bus_id" is platform-specific. On many platforms it
+ // should be 0. On machines with multiple PCIe buses, it should be
+  // the index of one of the PCIe buses. If the bus_id is invalid,
+ // results are undefined.
+ typedef std::function<void(void*, size_t)> AllocVisitor;
+ void AddGPUAllocVisitor(int bus_id, AllocVisitor visitor);
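+  //
+  // Example (sketch): a networking layer that must register GPU memory
+  // with its NIC driver could hook in like this, where RegisterWithNic
+  // is a hypothetical driver call:
+  //
+  //   ProcessState::singleton()->AddGPUAllocVisitor(
+  //       0 /*bus_id*/, [](void* ptr, size_t num_bytes) {
+  //         RegisterWithNic(ptr, num_bytes);
+  //       });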
+
+ typedef std::unordered_map<const void*, MemDesc> MDMap;
+
+ protected:
+ ProcessState();
+
+ static ProcessState* instance_;
+
+ mutex mu_;
+ int gpu_count_;
+
+ std::vector<PoolAllocator*> cpu_allocators_ GUARDED_BY(mu_);
+ std::vector<VisitableAllocator*> gpu_allocators_ GUARDED_BY(mu_);
+ std::vector<std::vector<AllocVisitor>> gpu_visitors_ GUARDED_BY(mu_);
+ std::vector<PoolAllocator*> cuda_host_allocators_ GUARDED_BY(mu_);
+
+ virtual ~ProcessState();
+
+ // Optional RecordingAllocators that wrap the corresponding
+ // Allocators for runtime attribute use analysis.
+ MDMap mem_desc_map_;
+ std::vector<Allocator*> cpu_al_ GUARDED_BY(mu_);
+ std::vector<Allocator*> gpu_al_ GUARDED_BY(mu_);
+ std::vector<Allocator*> cuda_al_ GUARDED_BY(mu_);
+};
+
+namespace internal {
+class RecordingAllocator : public Allocator {
+ public:
+ RecordingAllocator(ProcessState::MDMap* mm, Allocator* a,
+ ProcessState::MemDesc md, mutex* mu)
+ : mm_(mm), a_(a), md_(md), mu_(mu) {}
+
+ string Name() override { return a_->Name(); }
+ void* AllocateRaw(size_t alignment, size_t num_bytes) override {
+ void* p = a_->AllocateRaw(alignment, num_bytes);
+ mutex_lock l(*mu_);
+ (*mm_)[p] = md_;
+ return p;
+ }
+ void DeallocateRaw(void* p) override {
+ mutex_lock l(*mu_);
+ auto iter = mm_->find(p);
+ mm_->erase(iter);
+ a_->DeallocateRaw(p);
+ }
+ bool TracksAllocationSizes() override { return a_->TracksAllocationSizes(); }
+ size_t RequestedSize(void* p) override { return a_->RequestedSize(p); }
+ size_t AllocatedSize(void* p) override { return a_->AllocatedSize(p); }
+ ProcessState::MDMap* mm_; // not owned
+ Allocator* a_; // not owned
+ ProcessState::MemDesc md_;
+ mutex* mu_;
+};
+} // namespace internal
+} // namespace tensorflow
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_PROCESS_STATE_H_
diff --git a/tensorflow/core/common_runtime/gpu/visitable_allocator.h b/tensorflow/core/common_runtime/gpu/visitable_allocator.h
new file mode 100644
index 0000000000..23feed9aab
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/visitable_allocator.h
@@ -0,0 +1,30 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_
+#define TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_
+
+#include <functional>
+#include "tensorflow/core/framework/allocator.h"
+
+namespace tensorflow {
+
+// Subclass VisitableAllocator instead of Allocator when a memory
+// allocator needs to enable some kind of registration/deregistration
+// of memory areas.
+class VisitableAllocator : public Allocator {
+ public:
+ // Visitor gets called with a pointer to a memory area and its
+ // size in bytes.
+ typedef std::function<void(void*, size_t)> Visitor;
+
+ // Register a visitor guaranteed to be called exactly once on each
+ // chunk of memory newly allocated from the underlying device.
+ // Typically, chunks will be reused and possibly sub-divided by a
+ // pool manager, so the calls will happen only once per process
+ // execution, not once per tensor (re)allocation.
+ virtual void AddAllocVisitor(Visitor visitor) = 0;
+
+ // Register a visitor guaranteed to be called on each chunk of
+ // memory returned to the underlying device.
+ virtual void AddFreeVisitor(Visitor visitor) = 0;
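+
+  // Example (sketch): hooking a hypothetical RDMA library into a
+  // VisitableAllocator* "a":
+  //
+  //   a->AddAllocVisitor([](void* ptr, size_t num_bytes) {
+  //     RdmaRegister(ptr, num_bytes);    // hypothetical driver call
+  //   });
+  //   a->AddFreeVisitor([](void* ptr, size_t num_bytes) {
+  //     RdmaDeregister(ptr, num_bytes);  // hypothetical driver call
+  //   });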
+};
+} // namespace tensorflow
+#endif // TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_