about summary refs log tree commit diff homepage
path: root/tensorflow/contrib/nccl
diff options
context:
space:
mode:
author: A. Unique TensorFlower <gardener@tensorflow.org> 2017-09-19 04:07:01 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-09-19 04:10:15 -0700
commit 5882ae35d304f813313a4456c087237c29b63b64 (patch)
tree 4e52ae1602f4f2a8eb198e47ee283a548bdf7f87 /tensorflow/contrib/nccl
parent 23da21150d988f7cf5780488f24adbb116675586 (diff)
Update to NCCL version 1.3.5. Remove temporary buffer for ncclReduce, it's no longer needed in this version.
PiperOrigin-RevId: 169221983
Diffstat (limited to 'tensorflow/contrib/nccl')
-rw-r--r--  tensorflow/contrib/nccl/kernels/nccl_manager.cc | 10
-rw-r--r--  tensorflow/contrib/nccl/kernels/nccl_manager.h  |  3
-rw-r--r--  tensorflow/contrib/nccl/kernels/nccl_ops.cc     | 11
3 files changed, 9 insertions(+), 15 deletions(-)
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index 4b642f64c1..1eb1481675 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -312,11 +312,11 @@ void NcclManager::AddReduceSend(int num_devices, const string& key,
perftools::gputools::StreamExecutor* executor,
int gpu_device_id, EventMgr* event_mgr,
perftools::gputools::Stream* tensor_stream,
- const Tensor* in_t, Tensor* temp_t,
+ const Tensor* in_t,
DoneCallback done_callback) {
std::unique_ptr<Participant> participant(
- new Participant(in_t, temp_t, event_mgr, tensor_stream, executor,
- gpu_device_id, std::move(done_callback)));
+ new Participant(in_t, nullptr /* out_t */, event_mgr, tensor_stream,
+ executor, gpu_device_id, std::move(done_callback)));
AddParticipant(num_devices, key, std::move(participant), in_t->dtype(),
kReduce, reduction_op);
}
@@ -462,7 +462,9 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
}
case kReduce: {
const void* sendbuff = p->in_t->tensor_data().data();
- void* recvbuff = const_cast<char*>(p->out_t->tensor_data().data());
+ void* recvbuff = p->out_t
+ ? const_cast<char*>(p->out_t->tensor_data().data())
+ : nullptr;
nccl_result = ncclReduce(sendbuff, recvbuff, p->in_t->NumElements(),
data_type, collective->reduction_op,
collective->root_rank, nccl_comm, *cu_stream);
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index 619a1b69bf..cb1719c3be 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -82,8 +82,7 @@ class NcclManager {
perftools::gputools::StreamExecutor* executor,
int gpu_device_id, EventMgr* event_mgr,
perftools::gputools::Stream* tensor_stream,
- const Tensor* in_t, Tensor* temp_t,
- DoneCallback done_callback);
+ const Tensor* in_t, DoneCallback done_callback);
void AddReduceRecv(int num_devices, const string& key,
ncclRedOp_t reduction_op,
perftools::gputools::StreamExecutor* executor,
diff --git a/tensorflow/contrib/nccl/kernels/nccl_ops.cc b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
index 81cc74416b..4eb52492db 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_ops.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
@@ -121,14 +121,7 @@ class NcclReduceSendKernel : public NcclReduceOpBase {
: NcclReduceOpBase(c) {}
void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
- const Tensor& in_t = c->input(0);
- std::unique_ptr<Tensor> temp_ptr(new Tensor());
- OP_REQUIRES_OK_ASYNC(
- c, c->allocate_temp(in_t.dtype(), in_t.shape(), temp_ptr.get()), done);
- Tensor* temp_t = temp_ptr.release();
-
- auto actual_done = [c, done, temp_t](Status s) {
- delete temp_t;
+ auto actual_done = [c, done](Status s) {
OP_REQUIRES_OK_ASYNC(c, s, done);
done();
};
@@ -138,7 +131,7 @@ class NcclReduceSendKernel : public NcclReduceOpBase {
NcclManager::instance()->AddReduceSend(
num_devices(), GetCollectiveKey(c), reduction_op(),
compute_stream->parent(), gpu_info->gpu_id, gpu_info->event_mgr,
- compute_stream, &in_t, temp_t, std::move(actual_done));
+ compute_stream, &c->input(0), std::move(actual_done));
}
};
REGISTER_KERNEL_BUILDER(Name("NcclReduceSend").Device(DEVICE_GPU),