author     TensorFlower Gardener <gardener@tensorflow.org>  2018-10-06 09:47:48 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>  2018-10-06 09:47:48 -0700
commit     262f22f9eeee1ee00a9a92318d9a567a25c76696 (patch)
tree       67eb30540f6219b69bed86499d18960fc85159c6 /tensorflow/contrib
parent     fb92d456476c36210cea3b76393f584a306f092b (diff)
parent     f16111286b19f4145df63b73c45be1645bde8737 (diff)
Merge pull request #22559 from byronyi:master
PiperOrigin-RevId: 216040541
Diffstat (limited to 'tensorflow/contrib')
-rw-r--r--  tensorflow/contrib/gdr/gdr_memory_manager.cc  156
1 file changed, 93 insertions, 63 deletions
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index bb06f1c41c..3549cedb70 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -22,7 +22,6 @@ limitations under the License.
#include <fstream>
#include <list>
#include <map>
-#include <set>
#include <fcntl.h>
#include <rdma/rdma_cma.h>
@@ -30,19 +29,17 @@ limitations under the License.
#include <sys/epoll.h>
#include "tensorflow/contrib/gdr/gdr.pb.h"
-#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
-#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/common_runtime/process_state.h"
#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#endif // GOOGLE_CUDA
-#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
namespace tensorflow {
@@ -70,14 +67,11 @@ bool IsGDRAvailable() {
int TryToReadNumaNode(ibv_device* device) {
#if defined(__APPLE__)
LOG(INFO) << "OS X does not support NUMA - returning NUMA node 0";
- return 0;
+ return port::kNUMANoAffinity;
#elif defined(PLATFORM_WINDOWS)
// Windows support for NUMA is not currently implemented. Return node 0.
- return 0;
+ return port::kNUMANoAffinity;
#else
- VLOG(2) << "Trying to read NUMA node for device: " << device->name;
- static const int kUnknownNumaNode = -1;
-
auto filename = string(device->ibdev_path) + "/device/numa_node";
std::ifstream ifs(filename.c_str());
@@ -91,12 +85,12 @@ int TryToReadNumaNode(ibv_device* device) {
<< value
<< "), but there must be at least one NUMA node"
", so returning NUMA node zero";
- return 0;
+ return port::kNUMANoAffinity;
}
LOG(INFO) << "NUMA node for device: " << device->name << " is " << value;
return value;
}
- return kUnknownNumaNode;
+ return port::kNUMANoAffinity;
#endif
}
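
Note: the hunk above makes TryToReadNumaNode() report an unknown placement uniformly as port::kNUMANoAffinity (declared in tensorflow/core/platform/numa.h, whose include the patch adds) instead of pretending node 0. A minimal sketch of how a caller could collapse that sentinel back to a concrete index when one is still required; the helper name EffectiveNumaNode is illustrative and not part of the patch:

#include "tensorflow/core/platform/numa.h"

namespace tensorflow {

// Hypothetical helper (illustration only): fall back to node 0 when the
// device's NUMA node could not be determined.
inline int EffectiveNumaNode(int numa_node) {
  return numa_node == port::kNUMANoAffinity ? 0 : numa_node;
}

}  // namespace tensorflow
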
@@ -138,8 +132,6 @@ class GdrMemoryManager : public RemoteMemoryManager {
Device* device, DeviceContext* device_context, bool on_host,
StatusCallback done) override;
- static void RegMemVisitors();
-
protected:
Status CreateEndpoint(const string& host, const string& port,
RdmaEndpointPtr& endpoint);
@@ -150,7 +142,8 @@ class GdrMemoryManager : public RemoteMemoryManager {
ibv_mr* FindMemoryRegion(void* addr, size_t length);
- void InsertMemoryRegion(void* addr, size_t length);
+ void InsertMemoryRegion(void* addr, size_t length,
+ const std::string& allocator_name);
void EvictMemoryRegion(void* addr, size_t length);
@@ -160,6 +153,7 @@ class GdrMemoryManager : public RemoteMemoryManager {
RdmaEndpointPtr listening_;
std::atomic<bool> stopped_;
int epfd_;
+ int numa_node_;
// Server side endpoints
// Accessed sequentially in Run() so not protected by lock
@@ -190,46 +184,10 @@ GdrMemoryManager::GdrMemoryManager(const string& host, const string& port)
port_(port),
listening_(nullptr, EndpointDeleter),
stopped_(true),
- next_key_(0) {
- static std::once_flag flag;
- std::call_once(flag, []() { RegMemVisitors(); });
-}
+ next_key_(0) {}
GdrMemoryManager::~GdrMemoryManager() { close(epfd_); }
-/*static*/ void GdrMemoryManager::RegMemVisitors() {
- SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node,
- size_t num_bytes) {
- GdrMemoryManager::Singleton().InsertMemoryRegion(
- ptr, num_bytes, strings::StrCat("CPU:", numa_node));
- };
- SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node,
- size_t num_bytes) {
- GdrMemoryManager::Singleton().EvictMemoryRegion(ptr, num_bytes);
- };
- ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
- ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
-
-#if GOOGLE_CUDA
- if (IsGDRAvailable()) {
- int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
-
- // Note we don't free allocated GPU memory so there is no free visitor
- SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id,
- size_t num_bytes) {
- RdmaMemoryMgr::Singleton().InsertMemoryRegion(
- ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
- };
- GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
- cuda_alloc_visitor);
- GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
- alloc_visitor);
- GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
- LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
- }
-#endif // GOOGLE_CUDA
-}
-
Status GdrMemoryManager::Init() {
epfd_ = epoll_create1(0);
if (epfd_ == -1) {
@@ -289,6 +247,42 @@ Status GdrMemoryManager::Init() {
"cannot add server to epoll");
}
+ numa_node_ = TryToReadNumaNode(listening_->verbs->device);
+
+ SubAllocator::Visitor alloc_visitor = [this](void* ptr, int numa_node,
+ size_t num_bytes) {
+ VLOG(2) << "Registering RDMA capable memory region on numa_node "
+ << numa_node;
+ InsertMemoryRegion(ptr, num_bytes, strings::StrCat("CPU:", numa_node));
+ };
+ SubAllocator::Visitor free_visitor = [this](void* ptr, int numa_node,
+ size_t num_bytes) {
+ VLOG(2) << "De-registering RDMA capable memory region on numa_node "
+ << numa_node;
+ EvictMemoryRegion(ptr, num_bytes);
+ };
+ ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
+ ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
+ LOG(INFO) << "Instrumenting CPU allocator(s)";
+
+#if GOOGLE_CUDA
+ if (IsGDRAvailable()) {
+ int bus_id = numa_node_ + 1;
+
+ SubAllocator::Visitor cuda_alloc_visitor = [this](void* ptr, int gpu_id,
+ size_t num_bytes) {
+ VLOG(2) << "Registering RDMA capable memory region on GPU " << gpu_id;
+ InsertMemoryRegion(ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
+ };
+ GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
+ cuda_alloc_visitor);
+ GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
+ alloc_visitor);
+ GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
+ LOG(INFO) << "Instrumenting GPU allocator(s) with bus_id " << bus_id;
+ }
+#endif // GOOGLE_CUDA
+
return Status::OK();
}
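
Note: the rewritten Init() above registers the alloc/free visitors on the process-wide allocators directly, so each lambda can capture this and use numa_node_. A minimal sketch of the visitor shape it relies on, assuming only what the patch itself exercises (SubAllocator::Visitor, i.e. std::function<void(void*, int, size_t)>, and ProcessState::singleton()->AddCPUAllocVisitor()); the logging-only visitor and the InstallLoggingVisitor name are illustrative, not part of the change:

#include "tensorflow/core/common_runtime/process_state.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

// Illustration only: a visitor that merely logs each chunk handed out by the
// CPU sub-allocator. The patch's real visitors call InsertMemoryRegion() /
// EvictMemoryRegion() to pin and unpin the region for RDMA.
void InstallLoggingVisitor() {
  SubAllocator::Visitor log_visitor = [](void* ptr, int numa_node,
                                         size_t num_bytes) {
    VLOG(1) << "sub-allocated " << num_bytes << " bytes at " << ptr
            << " on NUMA node " << numa_node;
  };
  ProcessState::singleton()->AddCPUAllocVisitor(log_visitor);
}

}  // namespace tensorflow
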
@@ -405,7 +399,7 @@ void GdrMemoryManager::TransportOptionsFromTensor(
ibv_mr* mr = FindMemoryRegion(addr, length);
#if GOOGLE_CUDA
- if (!on_host) {
+ if (device->tensorflow_gpu_device_info() && !on_host) {
Allocator* alloc = GPUProcessState::singleton()->GetCUDAHostAllocator(0);
Tensor* host_copy = new Tensor(alloc, tensor.dtype(), tensor.shape());
GPUUtil::CopyGPUTensorToCPU(
@@ -456,11 +450,27 @@ void GdrMemoryManager::TransportOptionsFromTensor(
#endif
if (mr == nullptr) {
- done(errors::Unavailable("Cannot find pinned memory region"));
- return;
+ Allocator* alloc = ProcessState::singleton()->GetCPUAllocator(numa_node_);
+ Tensor host_copy(alloc, tensor.dtype(), tensor.shape());
+
+ std::memcpy(DMAHelper::buffer(&host_copy)->data(), buffer->data(), length);
+ VLOG(2) << "Copying " << length << " bytes unpinned tensor buffer";
+
+ buffer = DMAHelper::buffer(&host_copy);
+ addr = buffer->data();
+ length = buffer->size();
+
+ mr = FindMemoryRegion(addr, length);
+ if (mr == nullptr) {
+ done(errors::Unavailable("Cannot find pinned memory region"));
+ return;
+ }
+
+ buffer->Ref();
+ } else {
+ buffer->Ref();
}
- buffer->Ref();
TensorKey tensor_key = next_key_++;
{
mutex_lock l(server_mu_);
@@ -470,7 +480,7 @@ void GdrMemoryManager::TransportOptionsFromTensor(
uint64_t checksum = 0;
if (VLOG_IS_ON(2)) {
#ifdef GOOGLE_CUDA
- if (!on_host) {
+ if (device->tensorflow_gpu_device_info() && !on_host) {
checksum = GPUUtil::Checksum(device, device_context, tensor);
} else {
checksum = GPUUtil::Checksum(tensor);
@@ -508,7 +518,8 @@ void GdrMemoryManager::TensorFromTransportOptions(
Tensor host_copy;
#if GOOGLE_CUDA
if (mr == nullptr && !on_host) {
- Allocator* alloc = GPUProcessState::singleton()->GetCUDAHostAllocator(0);
+ Allocator* alloc =
+ GPUProcessState::singleton()->GetCUDAHostAllocator(numa_node_);
host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
buffer = DMAHelper::buffer(&host_copy);
addr = buffer->data();
@@ -518,8 +529,18 @@ void GdrMemoryManager::TensorFromTransportOptions(
#endif // GOOGLE_CUDA
if (mr == nullptr) {
- done(errors::Unavailable("Cannot find pinned memory region"));
- return;
+ Allocator* alloc = ProcessState::singleton()->GetCPUAllocator(numa_node_);
+ host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
+
+ buffer = DMAHelper::buffer(&host_copy);
+ addr = buffer->data();
+ length = buffer->size();
+
+ mr = FindMemoryRegion(addr, length);
+ if (mr == nullptr) {
+ done(errors::Unavailable("Cannot find pinned memory region"));
+ return;
+ }
}
decltype(clients_)::iterator iter;
@@ -568,7 +589,8 @@ void GdrMemoryManager::TensorFromTransportOptions(
}
#if GOOGLE_CUDA
- if (host_copy.NumElements() > 0) {
+ if (device->tensorflow_gpu_device_info() && !on_host &&
+ host_copy.NumElements() > 0) {
uint64_t checksum = 0;
if (VLOG_IS_ON(2)) {
checksum = GPUUtil::Checksum(host_copy);
@@ -598,6 +620,12 @@ void GdrMemoryManager::TensorFromTransportOptions(
}
#endif // GOOGLE_CUDA
+ if ((on_host || !device->tensorflow_gpu_device_info()) &&
+ host_copy.NumElements() > 0) {
+ std::memcpy(DMAHelper::buffer(tensor)->data(), addr, length);
+ VLOG(2) << "Copying " << length << " bytes unpinned tensor buffer";
+ }
+
uint64_t end = Env::Default()->NowMicros();
VLOG(2) << "RDMA from remote memory region " << remote_mr.rkey()
@@ -607,7 +635,7 @@ void GdrMemoryManager::TensorFromTransportOptions(
uint64_t checksum = 0;
if (VLOG_IS_ON(2)) {
#ifdef GOOGLE_CUDA
- if (device->tensorflow_gpu_device_info() && (!on_host)) {
+ if (device->tensorflow_gpu_device_info() && !on_host) {
checksum = GPUUtil::Checksum(device, device_context, *tensor);
} else {
checksum = GPUUtil::Checksum(*tensor);
@@ -668,7 +696,8 @@ ibv_mr* GdrMemoryManager::FindMemoryRegion(void* addr, size_t length) {
}
}
-void GdrMemoryManager::InsertMemoryRegion(void* addr, size_t length) {
+void GdrMemoryManager::InsertMemoryRegion(void* addr, size_t length,
+ const std::string& allocator_name) {
if (length == 0) return;
ibv_mr* mr = rdma_reg_read(listening_.get(), addr, length);
if (mr != nullptr) {
@@ -676,7 +705,8 @@ void GdrMemoryManager::InsertMemoryRegion(void* addr, size_t length) {
auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
mrs_.insert(iter, {mr, &MRDeleter});
} else {
- LOG(WARNING) << "Cannot register memory region";
+ LOG(WARNING) << "Cannot register memory region allocated by "
+ << allocator_name;
}
}