author | 2018-06-27 12:52:51 -0700
---|---
committer | 2018-06-28 21:37:43 -0700
commit | f5f67296c3430bae595697af3a78460e027cdc6d (patch)
tree | 06c4d7addb7820150d8620e79d41701f5db7b073 /tensorflow/core/common_runtime/collective_rma_local.cc
parent | 15f6b62aeef8292eddd6edbc9ed15cc49774218e (diff)
Add GPUOptions::num_dev_to_dev_copy_streams to allow creation of
more than one device-to-device copy stream per GPU device.
This is an experimental feature that will have no effect unless
copy operations explicitly request a stream other than 0, which
currently does not occur anywhere in a standard build.
Eventually it may be of benefit when multiple bidirectional data
copies run concurrently.
PiperOrigin-RevId: 202354513
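
For context, a minimal sketch of how a client might request extra copy streams through the session ConfigProto. The placement of the field under GPUOptions::Experimental is an assumption here (released TensorFlow versions expose it there); adjust if your version exposes it directly on GPUOptions as the message above suggests.

```cpp
#include "tensorflow/core/protobuf/config.pb.h"

int main() {
  tensorflow::ConfigProto config;
  // Sketch: request two device-to-device copy streams per GPU device.
  // A requested stream index of 0 still selects the original stream,
  // so behavior matches a standard build unless a copy explicitly
  // asks for another index.
  config.mutable_gpu_options()
      ->mutable_experimental()
      ->set_num_dev_to_dev_copy_streams(2);
  // The populated ConfigProto would then be passed to session creation.
  return 0;
}
```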
Diffstat (limited to 'tensorflow/core/common_runtime/collective_rma_local.cc')
-rw-r--r-- | tensorflow/core/common_runtime/collective_rma_local.cc | 14
1 file changed, 8 insertions, 6 deletions
diff --git a/tensorflow/core/common_runtime/collective_rma_local.cc b/tensorflow/core/common_runtime/collective_rma_local.cc
index 69f1a9f24c..288ae9d794 100644
--- a/tensorflow/core/common_runtime/collective_rma_local.cc
+++ b/tensorflow/core/common_runtime/collective_rma_local.cc
@@ -27,7 +27,8 @@ void CollectiveRemoteAccessLocal::RecvFromPeer(
     const string& peer_device, const string& peer_task, bool peer_is_local,
     const string& key, Device* to_device, DeviceContext* to_device_ctx,
     const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
-    const DeviceLocality& client_locality, const StatusCallback& done) {
+    const DeviceLocality& client_locality, int dev_to_dev_stream_index,
+    const StatusCallback& done) {
   VLOG(1) << "RecvFromPeer " << this << " from " << peer_device << " key "
           << key;
   if (!peer_is_local) {
@@ -37,8 +38,9 @@ void CollectiveRemoteAccessLocal::RecvFromPeer(
     return;
   }
   buf_rendezvous_.ConsumeBuf(
-      key, [this, to_tensor, to_device_ctx, to_device, to_alloc_attr, done](
-               const Status& s, BufRendezvous::Hook* hook) {
+      key, [this, to_tensor, to_device_ctx, to_device, to_alloc_attr,
+            dev_to_dev_stream_index,
+            done](const Status& s, BufRendezvous::Hook* hook) {
         if (!s.ok()) {
           done(s);
           delete hook;
@@ -53,7 +55,7 @@ void CollectiveRemoteAccessLocal::RecvFromPeer(
               to_alloc_attr,     // dst AllocatorAttributes
               hook->prod_value,  // src Tensor*
               to_tensor,         // dst Tensor*
-              [hook, done](const Status& s) {
+              dev_to_dev_stream_index, [hook, done](const Status& s) {
                 // This callback may be executing in the GPUEventMgr
                 // pool in which case it must be very short duration
                 // and non-blocking (except e.g. for queue insertion).
@@ -82,7 +84,7 @@ void CollectiveRemoteAccessLocal::MemCpyAsync(
     DeviceContext* src_dev_ctx, DeviceContext* dst_dev_ctx, Device* src_dev,
     Device* dst_dev, const AllocatorAttributes& src_attr,
     const AllocatorAttributes& dst_attr, const Tensor* src, Tensor* dst,
-    const StatusCallback& done) {
+    int dev_to_dev_stream_index, const StatusCallback& done) {
   // We want a real copy to happen, i.e. the bytes inside of src should be
   // transferred to the buffer backing dst. If src and dst are on different
   // devices then CopyTensor::ViaDMA will do just that. But if they're both
@@ -115,7 +117,7 @@ void CollectiveRemoteAccessLocal::MemCpyAsync(
   if (non_cpu_src || non_cpu_dst) {
     CopyTensor::ViaDMA("",  // edge name (non-existent)
                        src_dev_ctx, dst_dev_ctx, src_dev, dst_dev, src_attr,
-                       dst_attr, src, dst, done);
+                       dst_attr, src, dst, dev_to_dev_stream_index, done);
   } else {
     int64 bytes = src->TotalBytes();
     DCHECK_EQ(dst->TotalBytes(), bytes);
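
To make the new parameter concrete, here is a hedged sketch of a call site; the surrounding variables are illustrative, not taken from the source. Per the commit message, every call site in a standard build passes index 0, which preserves existing behavior.

```cpp
// Fragment, not a complete program: rma_local and the arguments are
// assumed to exist as in CollectiveRemoteAccessLocal's usual callers.
// Index 0 selects the default device-to-device copy stream.
rma_local->RecvFromPeer(
    peer_device, peer_task, /*peer_is_local=*/true, key, to_device,
    to_device_ctx, to_alloc_attr, &to_tensor, client_locality,
    /*dev_to_dev_stream_index=*/0,
    [](const tensorflow::Status& s) { TF_CHECK_OK(s); });
```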