aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/common_runtime/collective_rma_local.cc
diff options
context:
space:
mode:
author    A. Unique TensorFlower <gardener@tensorflow.org> 2018-06-27 12:52:51 -0700
committer Gunhan Gulsoy <gunan@google.com> 2018-06-28 21:37:43 -0700
commit f5f67296c3430bae595697af3a78460e027cdc6d (patch)
tree   06c4d7addb7820150d8620e79d41701f5db7b073 /tensorflow/core/common_runtime/collective_rma_local.cc
parent 15f6b62aeef8292eddd6edbc9ed15cc49774218e (diff)
Add GPUOptions::num_dev_to_dev_copy_streams to allow creation of
more than one device-to-device copy stream per GPU device. This is an experimental feature that will have no effect unless copy operations explicitly request a stream other than 0, which currently does not occur anywhere in a standard build. Eventually it may be of benefit in the presence of multiple bi-directional concurrent data copies. PiperOrigin-RevId: 202354513
Diffstat (limited to 'tensorflow/core/common_runtime/collective_rma_local.cc')
-rw-r--r--  tensorflow/core/common_runtime/collective_rma_local.cc  14
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/tensorflow/core/common_runtime/collective_rma_local.cc b/tensorflow/core/common_runtime/collective_rma_local.cc
index 69f1a9f24c..288ae9d794 100644
--- a/tensorflow/core/common_runtime/collective_rma_local.cc
+++ b/tensorflow/core/common_runtime/collective_rma_local.cc
@@ -27,7 +27,8 @@ void CollectiveRemoteAccessLocal::RecvFromPeer(
const string& peer_device, const string& peer_task, bool peer_is_local,
const string& key, Device* to_device, DeviceContext* to_device_ctx,
const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
- const DeviceLocality& client_locality, const StatusCallback& done) {
+ const DeviceLocality& client_locality, int dev_to_dev_stream_index,
+ const StatusCallback& done) {
VLOG(1) << "RecvFromPeer " << this << " from " << peer_device << " key "
<< key;
if (!peer_is_local) {
@@ -37,8 +38,9 @@ void CollectiveRemoteAccessLocal::RecvFromPeer(
return;
}
buf_rendezvous_.ConsumeBuf(
- key, [this, to_tensor, to_device_ctx, to_device, to_alloc_attr, done](
- const Status& s, BufRendezvous::Hook* hook) {
+ key, [this, to_tensor, to_device_ctx, to_device, to_alloc_attr,
+ dev_to_dev_stream_index,
+ done](const Status& s, BufRendezvous::Hook* hook) {
if (!s.ok()) {
done(s);
delete hook;
@@ -53,7 +55,7 @@ void CollectiveRemoteAccessLocal::RecvFromPeer(
to_alloc_attr, // dst AllocatorAttributes
hook->prod_value, // src Tensor*
to_tensor, // dst Tensor*
- [hook, done](const Status& s) {
+ dev_to_dev_stream_index, [hook, done](const Status& s) {
// This callback may be executing in the GPUEventMgr
// pool in which case it must be very short duration
// and non-blocking (except e.g. for queue insertion).
@@ -82,7 +84,7 @@ void CollectiveRemoteAccessLocal::MemCpyAsync(
DeviceContext* src_dev_ctx, DeviceContext* dst_dev_ctx, Device* src_dev,
Device* dst_dev, const AllocatorAttributes& src_attr,
const AllocatorAttributes& dst_attr, const Tensor* src, Tensor* dst,
- const StatusCallback& done) {
+ int dev_to_dev_stream_index, const StatusCallback& done) {
// We want a real copy to happen, i.e. the bytes inside of src should be
// transferred to the buffer backing dst. If src and dst are on different
// devices then CopyTensor::ViaDMA will do just that. But if they're both
@@ -115,7 +117,7 @@ void CollectiveRemoteAccessLocal::MemCpyAsync(
if (non_cpu_src || non_cpu_dst) {
CopyTensor::ViaDMA("", // edge name (non-existent)
src_dev_ctx, dst_dev_ctx, src_dev, dst_dev, src_attr,
- dst_attr, src, dst, done);
+ dst_attr, src, dst, dev_to_dev_stream_index, done);
} else {
int64 bytes = src->TotalBytes();
DCHECK_EQ(dst->TotalBytes(), bytes);