aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/contrib/gdr
diff options
context:
space:
mode:
authorGravatar Shanqing Cai <cais@google.com>2017-09-25 19:35:53 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-09-25 19:39:42 -0700
commite2e3a943c0a28b7656325acb3fcd035743d55ea0 (patch)
treef4b909d5410bdf3b94012392909e7805cd27a2a7 /tensorflow/contrib/gdr
parentdf22044be98c8b707601e03fe22ded53bcc28c7e (diff)
Merge changes from github.
END_PUBLIC --- Commit 1e1b3d902 authored by Pete Warden<pete@petewarden.com> Committed by gunan<gunan@google.com>: Changed output directory for Pi CI build to fix permissions problem with nightlies (#13257) * Fix for RTLD_GLOBAL breakage of Pi builds, and removed Eigen version change for Pi that's no longer needed * Fixed Pi Zero OpenBLAS build problems and tidied up directories used * More robust checks in Pi build script * Changed output directory for Pi CI build to fix permissions problem --- Commit fe3a2e65c authored by Yan Facai (???)<facai.yan@gmail.com> Committed by drpngx<drpngx@users.noreply.github.com>: check invalid string type for dest_nodes in extract_sub_graph (#13057) * BUG: check str type * TST: add unit test * CLN: remove list check * CLN: use warning * CLN: 2 indent * CLN: raise TypeError if not list * CLN: check string only --- Commit 225ab7629 authored by Jean Wanka<jm.wanka@gmail.com> Committed by Jean Wanka<jm.wanka@gmail.com>: Fix polynomial decay with cycle for global step=0 For polynomial decay with cycle=True the learning rate at step 0 becomes NaN, because in the process of calculating it we devide by 0. This change should fix it, by setting the multiplier for the decay steps to one for global_step=0. --- Commit 286f57061 authored by Bjarke Hammersholt Roune<broune@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Make Service::TransferToClient not attempt to manipulate the literal when the transfer failed, preventing a crash and allowing the caller to see the reason for the failed transfer. PiperOrigin-RevId: 169770126 --- Commit e0501bc4d authored by Yong Tang<yong.tang.github@outlook.com> Committed by Shanqing Cai<cais@google.com>: Fix GRUBlockCell parameter naming inconsistency (#13153) * Fix GRUBlockCell parameter naming inconsistency This fix tries to fix the issue in 13137 where parameter `cell_size` is used instead of `num_units`. This is inconsistent with other RNN cells. This fix adds support of `num_units` while at the same time maintains backward compatiblility for `cell_size`. This fix fixes 13137. Signed-off-by: Yong Tang <yong.tang.github@outlook.com> * Add `@deprecated_args` for 'cell_size' in `GRUBlockCell` This commit adds `@deprecated_args` for 'cell_size' in `GRUBlockCell` Signed-off-by: Yong Tang <yong.tang.github@outlook.com> * Address review comment Signed-off-by: Yong Tang <yong.tang.github@outlook.com> --- Commit 02a2eba05 authored by Pete Warden<pete@petewarden.com> Committed by gunan<gunan@google.com>: Fix for RTLD_GLOBAL breakage of Pi builds, and removed Eigen version change that's no longer needed (#13251) * Fix for RTLD_GLOBAL breakage of Pi builds, and removed Eigen version change for Pi that's no longer needed * Fixed Pi Zero OpenBLAS build problems and tidied up directories used * More robust checks in Pi build script --- Commit 8ef722253 authored by Sanjoy Das<sanjoy@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Remove a redundant setName. The EmitComputation should have emitted a function with the right name, so use a CHECK instead. PiperOrigin-RevId: 169764856 --- Commit 1b94147dc authored by Neal Wu<wun@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Fix broken GitHub links in tensorflow and tensorflow_models resulting from The Great Models Move (a.k.a. the research subfolder) PiperOrigin-RevId: 169763373 --- Commit b1ada5f0c authored by Justine Tunney<jart@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Fix TensorBoard python -m invoke in docs PiperOrigin-RevId: 169758752 --- Commit 2957cd894 authored by Mustafa Ispir<ispir@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Local run option of estimator training. PiperOrigin-RevId: 169756384 --- Commit 1dc2fe7ac authored by Gunhan Gulsoy<gunan@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: BEGIN_PUBLIC Automated g4 rollback of changelist 166264198 PiperOrigin-RevId: 169998124
Diffstat (limited to 'tensorflow/contrib/gdr')
-rw-r--r--tensorflow/contrib/gdr/gdr_memory_manager.cc182
-rw-r--r--tensorflow/contrib/gdr/gdr_memory_manager.h10
-rw-r--r--tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc22
-rw-r--r--tensorflow/contrib/gdr/gdr_worker.cc35
4 files changed, 150 insertions, 99 deletions
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index c55989e3e5..5c7ac74428 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -125,13 +125,15 @@ class GdrMemoryManager : public RemoteMemoryManager {
virtual void Stop() override;
- virtual Status TransportOptionsFromTensor(
+ virtual void TransportOptionsFromTensor(
::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
- Device* device, DeviceContext* device_context, bool on_host) override;
+ Device* device, DeviceContext* device_context, bool on_host,
+ StatusCallback done) override;
- virtual Status TensorFromTransportOptions(
+ virtual void TensorFromTransportOptions(
Tensor* tensor, const ::google::protobuf::Any& transport_options,
- Device* device, DeviceContext* device_context, bool on_host) override;
+ Device* device, DeviceContext* device_context, bool on_host,
+ StatusCallback done) override;
protected:
Status CreateEndpoint(const string& host, const string& port,
@@ -145,10 +147,6 @@ class GdrMemoryManager : public RemoteMemoryManager {
void InsertMemoryRegion(void* addr, size_t length);
-#if GOOGLE_CUDA
- void InsertCUDAMemoryRegion(void* addr, size_t length);
-#endif
-
void EvictMemoryRegion(void* addr, size_t length);
private:
@@ -415,45 +413,74 @@ void GdrMemoryManager::Run() {
void GdrMemoryManager::Stop() { stopped_ = true; }
-Status GdrMemoryManager::TransportOptionsFromTensor(
+void GdrMemoryManager::TransportOptionsFromTensor(
::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
- Device* device, DeviceContext* device_context, bool on_host) {
+ Device* device, DeviceContext* device_context, bool on_host,
+ StatusCallback done) {
auto buffer = DMAHelper::buffer(&tensor);
void* addr = buffer->data();
size_t length = buffer->size();
if (length == 0) {
- return errors::Unavailable("Cannot register tensor buffer of size 0");
+ done(errors::Unavailable("Cannot register tensor buffer of size 0"));
+ return;
}
ibv_mr* mr = FindMemoryRegion(addr, length);
- Tensor host_copy;
#if GOOGLE_CUDA
- if (!on_host && mr != nullptr) {
- TF_RETURN_IF_ERROR(GPUUtil::Sync(device));
- } else if (!on_host) {
+ if (!on_host) {
Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
- host_copy = Tensor(alloc, tensor.dtype(), tensor.shape());
- Status s;
- Notification n;
- GPUUtil::CopyGPUTensorToCPU(device, device_context, &tensor, &host_copy,
- [&s, &n](const Status& status) {
- s.Update(status);
- n.Notify();
- });
- n.WaitForNotification();
- if (!s.ok()) {
- return s;
- }
- buffer = DMAHelper::buffer(&host_copy);
- addr = buffer->data();
- length = buffer->size();
- mr = FindMemoryRegion(addr, length);
+ Tensor* host_copy = new Tensor(alloc, tensor.dtype(), tensor.shape());
+ GPUUtil::CopyGPUTensorToCPU(
+ device, device_context, &tensor, host_copy,
+ [done, host_copy, mutable_transport_options, this](const Status& s) {
+ if (!s.ok()) {
+ done(s);
+ delete host_copy;
+ return;
+ }
+ auto buffer = DMAHelper::buffer(host_copy);
+ void* addr = buffer->data();
+ size_t length = buffer->size();
+ ibv_mr* mr = FindMemoryRegion(addr, length);
+
+ if (mr == nullptr) {
+ done(errors::Unavailable("Cannot find pinned memory region"));
+ delete host_copy;
+ return;
+ }
+
+ buffer->Ref();
+ TensorKey tensor_key = next_key_++;
+ {
+ mutex_lock l(server_mu_);
+ tensor_buffers_.insert(std::make_pair(tensor_key, buffer));
+ }
+
+ uint64_t checksum = 0;
+ if (VLOG_IS_ON(2)) {
+ checksum = GPUUtil::Checksum(*host_copy);
+ }
+
+ RemoteMemoryRegion remote_mr;
+ remote_mr.set_host(host_);
+ remote_mr.set_port(port_);
+ remote_mr.set_addr(reinterpret_cast<uint64_t>(addr));
+ remote_mr.set_rkey(mr->rkey);
+ remote_mr.set_tensor_key(tensor_key);
+ remote_mr.set_checksum(checksum);
+ mutable_transport_options->PackFrom(remote_mr);
+
+ done(Status::OK());
+ delete host_copy;
+ });
+ return;
}
#endif
if (mr == nullptr) {
- return errors::Unavailable("Cannot find pinned memory region");
+ done(errors::Unavailable("Cannot find pinned memory region"));
+ return;
}
buffer->Ref();
@@ -466,12 +493,8 @@ Status GdrMemoryManager::TransportOptionsFromTensor(
uint64_t checksum = 0;
if (VLOG_IS_ON(2)) {
#ifdef GOOGLE_CUDA
- if (device->tensorflow_gpu_device_info() && (!on_host)) {
- if (host_copy.NumElements() > 0) {
- checksum = GPUUtil::Checksum(device, device_context, host_copy);
- } else {
- checksum = GPUUtil::Checksum(device, device_context, tensor);
- }
+ if (!on_host) {
+ checksum = GPUUtil::Checksum(device, device_context, tensor);
} else {
checksum = GPUUtil::Checksum(tensor);
}
@@ -487,15 +510,17 @@ Status GdrMemoryManager::TransportOptionsFromTensor(
remote_mr.set_checksum(checksum);
mutable_transport_options->PackFrom(remote_mr);
- return Status::OK();
+ done(Status::OK());
}
-Status GdrMemoryManager::TensorFromTransportOptions(
+void GdrMemoryManager::TensorFromTransportOptions(
Tensor* tensor, const ::google::protobuf::Any& transport_options,
- Device* device, DeviceContext* device_context, bool on_host) {
+ Device* device, DeviceContext* device_context, bool on_host,
+ StatusCallback done) {
RemoteMemoryRegion remote_mr;
if (!transport_options.UnpackTo(&remote_mr)) {
- return errors::NotFound("No RDMA transport options found");
+ done(errors::NotFound("No RDMA transport options found"));
+ return;
}
auto buffer = DMAHelper::buffer(tensor);
@@ -505,9 +530,7 @@ Status GdrMemoryManager::TensorFromTransportOptions(
Tensor host_copy;
#if GOOGLE_CUDA
- if (!on_host && mr != nullptr) {
- TF_RETURN_IF_ERROR(GPUUtil::Sync(device));
- } else if (!on_host) {
+ if (mr == nullptr && !on_host) {
Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
buffer = DMAHelper::buffer(&host_copy);
@@ -518,7 +541,8 @@ Status GdrMemoryManager::TensorFromTransportOptions(
#endif // GOOGLE_CUDA
if (mr == nullptr) {
- return errors::Unavailable("Cannot find pinned memory region");
+ done(errors::Unavailable("Cannot find pinned memory region"));
+ return;
}
decltype(clients_)::iterator iter;
@@ -529,8 +553,12 @@ Status GdrMemoryManager::TensorFromTransportOptions(
std::make_pair(std::make_pair(remote_mr.host(), remote_mr.port()),
RdmaEndpointPtr(nullptr, EndpointDeleter)));
if (success || iter->second.get() == nullptr) {
- TF_RETURN_IF_ERROR(
- CreateEndpoint(remote_mr.host(), remote_mr.port(), iter->second));
+ Status s =
+ CreateEndpoint(remote_mr.host(), remote_mr.port(), iter->second);
+ if (!s.ok()) {
+ done(s);
+ return;
+ }
}
}
rdma_cm_id* id = iter->second.get();
@@ -539,37 +567,57 @@ Status GdrMemoryManager::TensorFromTransportOptions(
if (rdma_post_read(id, nullptr, buffer->data(), buffer->size(), mr, 0,
remote_mr.addr(), remote_mr.rkey())) {
- return errors::Unavailable(strerror(errno), ": ", "rdma_post_read failed");
+ done(errors::Unavailable(strerror(errno), ": ", "rdma_post_read failed"));
+ return;
}
ibv_send_wr wr = {};
wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
wr.imm_data = htonl(remote_mr.tensor_key());
- wr.send_flags = IBV_SEND_FENCE | IBV_SEND_SIGNALED;
+ wr.send_flags = IBV_SEND_SIGNALED;
ibv_send_wr* bad_wr;
if (ibv_post_send(id->qp, &wr, &bad_wr)) {
- return errors::Unavailable(strerror(errno), ": ", "ibv_post_send failed");
+ done(errors::Unavailable(strerror(errno), ": ", "ibv_post_send failed"));
+ return;
}
ibv_wc wc = {};
- int ret = rdma_get_send_comp(id, &wc);
+ int ret;
+ while ((ret = ibv_poll_cq(id->send_cq, 1, &wc)) == 0)
+ ;
if (ret < 0 || wc.status) {
- return errors::Unavailable(ibv_wc_status_str(wc.status));
+ done(errors::Unavailable(ibv_wc_status_str(wc.status)));
+ return;
}
#if GOOGLE_CUDA
if (host_copy.NumElements() > 0) {
- Status s;
- Notification n;
- GPUUtil::CopyCPUTensorToGPU(&host_copy, device_context, device, tensor,
- [&s, &n](const Status& status) {
- s.Update(status);
- n.Notify();
- });
- n.WaitForNotification();
- if (!s.ok()) {
- return s;
+ uint64_t checksum = 0;
+ if (VLOG_IS_ON(2)) {
+ checksum = GPUUtil::Checksum(host_copy);
+ CHECK(checksum == remote_mr.checksum())
+ << "Checksum mismatch: " << checksum << "!=" << remote_mr.checksum();
}
+ Tensor* ref = new Tensor;
+ std::swap(host_copy, *ref);
+ GPUUtil::CopyCPUTensorToGPU(
+ ref, device_context, device, tensor,
+ [ref, done, buffer, remote_mr, start](const Status& s) {
+ if (!s.ok()) {
+ done(s);
+ delete ref;
+ return;
+ }
+ uint64_t end = Env::Default()->NowMicros();
+
+ VLOG(2) << "RDMA from remote memory region " << remote_mr.rkey()
+ << " of size " << buffer->size() << " with tensor key "
+ << remote_mr.tensor_key() << " took " << (end - start)
+ << " micros";
+ done(Status::OK());
+ delete ref;
+ });
+ return;
}
#endif // GOOGLE_CUDA
@@ -583,11 +631,7 @@ Status GdrMemoryManager::TensorFromTransportOptions(
if (VLOG_IS_ON(2)) {
#ifdef GOOGLE_CUDA
if (device->tensorflow_gpu_device_info() && (!on_host)) {
- if (host_copy.NumElements() > 0) {
- checksum = GPUUtil::Checksum(device, device_context, host_copy);
- } else {
- checksum = GPUUtil::Checksum(device, device_context, *tensor);
- }
+ checksum = GPUUtil::Checksum(device, device_context, *tensor);
} else {
checksum = GPUUtil::Checksum(*tensor);
}
@@ -595,7 +639,7 @@ Status GdrMemoryManager::TensorFromTransportOptions(
<< "!=" << remote_mr.checksum();
#endif
}
- return Status::OK();
+ done(Status::OK());
}
Status GdrMemoryManager::CreateEndpoint(const string& host, const string& port,
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.h b/tensorflow/contrib/gdr/gdr_memory_manager.h
index e0e2a3f624..9ac1aa96c4 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.h
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.h
@@ -39,15 +39,17 @@ class RemoteMemoryManager {
// Encodes the tensor information to an arbitrary protocol buffer
// The protocol buffer needs to be transmitted via some other channel
- virtual Status TransportOptionsFromTensor(
+ virtual void TransportOptionsFromTensor(
::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
- Device* device, DeviceContext* device_context, bool on_host) = 0;
+ Device* device, DeviceContext* device_context, bool on_host,
+ StatusCallback done) = 0;
// Retrieve the tensor from the encoded protocol buffer
// Note that the tensor has to be allocated, but not initialized
- virtual Status TensorFromTransportOptions(
+ virtual void TensorFromTransportOptions(
Tensor* tensor, const ::google::protobuf::Any& transport_options,
- Device* device, DeviceContext* device_context, bool on_host) = 0;
+ Device* device, DeviceContext* device_context, bool on_host,
+ StatusCallback done) = 0;
};
RemoteMemoryManager* CreateRemoteMemoryManager(const string& host,
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index 259ee8817d..adef2aac33 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -61,16 +61,20 @@ class GdrRecvTensorCall : public BaseRecvTensorCall {
const bool on_host =
(dst_device_->tensorflow_gpu_device_info() == nullptr) ||
recv_args_.alloc_attrs.on_host();
- Status s = remote_memory_manager_->TensorFromTransportOptions(
+ remote_memory_manager_->TensorFromTransportOptions(
const_cast<Tensor*>(&tensor()), transport_options, dst_device_,
- recv_args_.device_context, on_host);
- if (!s.ok()) {
- mutex_lock l(mu_);
- status_.Update(s);
- LOG(ERROR)
- << "Cannot find pinned memory region from allocator "
- << dst_device_->GetAllocator(recv_args_.alloc_attrs)->Name();
- }
+ recv_args_.device_context, on_host,
+ [this, recv_done](const Status& s) {
+ if (!s.ok()) {
+ mutex_lock l(mu_);
+ status_.Update(s);
+ LOG(ERROR) << "Cannot find pinned memory region from allocator "
+ << dst_device_->GetAllocator(recv_args_.alloc_attrs)
+ ->Name();
+ }
+ recv_done();
+ });
+ return;
}
if (!s.ok()) {
mutex_lock l(mu_);
diff --git a/tensorflow/contrib/gdr/gdr_worker.cc b/tensorflow/contrib/gdr/gdr_worker.cc
index 0bff0aff6d..5686412347 100644
--- a/tensorflow/contrib/gdr/gdr_worker.cc
+++ b/tensorflow/contrib/gdr/gdr_worker.cc
@@ -86,24 +86,25 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
if (val.TotalBytes() > 0 && (!is_dead) &&
DMAHelper::CanUseDMA(&val) && dma_ok) {
// DMA cases.
- RecvTensorResponse proto;
- auto transport_options = proto.mutable_transport_options();
- Status s = remote_memory_manager_->TransportOptionsFromTensor(
+ RecvTensorResponse* proto = new RecvTensorResponse;
+ proto->set_is_dead(is_dead);
+ proto->set_send_start_micros(Env::Default()->NowMicros());
+ TensorProto* tensor_proto = proto->mutable_tensor();
+ tensor_proto->set_dtype(val.dtype());
+ val.shape().AsProto(tensor_proto->mutable_tensor_shape());
+ auto transport_options = proto->mutable_transport_options();
+ remote_memory_manager_->TransportOptionsFromTensor(
transport_options, val, src_dev, send_args.device_context,
- on_host);
- if (s.ok()) {
- proto.set_is_dead(is_dead);
- proto.set_send_start_micros(Env::Default()->NowMicros());
- TensorProto* tensor_proto = proto.mutable_tensor();
- tensor_proto->set_dtype(val.dtype());
- val.shape().AsProto(tensor_proto->mutable_tensor_shape());
- grpc::EncodeRecvTensorResponseToByteBuffer(proto, response);
- done(Status::OK());
- return;
- } else {
- done(s);
- return;
- }
+ on_host, [proto, done, response](const Status& s) {
+ if (s.ok()) {
+ grpc::EncodeRecvTensorResponseToByteBuffer(*proto,
+ response);
+ done(Status::OK());
+ } else {
+ done(s);
+ }
+ delete proto;
+ });
} else {
// Non-DMA cases.
if (src_dev->tensorflow_gpu_device_info() && (!on_host)) {