about summary refs log tree commit diff homepage
path: root/tensorflow/contrib/nccl
diff options
context:
space:
mode:
author: A. Unique TensorFlower <gardener@tensorflow.org> 2017-09-19 04:07:01 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-09-19 04:10:15 -0700
commit 5882ae35d304f813313a4456c087237c29b63b64 (patch)
tree 4e52ae1602f4f2a8eb198e47ee283a548bdf7f87 /tensorflow/contrib/nccl
parent 23da21150d988f7cf5780488f24adbb116675586 (diff)
Update to NCCL version 1.3.5. Remove temporary buffer for ncclReduce, it's no longer needed in this version.
PiperOrigin-RevId: 169221983
Diffstat (limited to 'tensorflow/contrib/nccl')
-rw-r--r--  tensorflow/contrib/nccl/kernels/nccl_manager.cc | 10
-rw-r--r--  tensorflow/contrib/nccl/kernels/nccl_manager.h  |  3
-rw-r--r--  tensorflow/contrib/nccl/kernels/nccl_ops.cc     | 11
3 files changed, 9 insertions(+), 15 deletions(-)
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index 4b642f64c1..1eb1481675 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -312,11 +312,11 @@ void NcclManager::AddReduceSend(int num_devices, const string& key,
perftools::gputools::StreamExecutor* executor,
int gpu_device_id, EventMgr* event_mgr,
perftools::gputools::Stream* tensor_stream,
- const Tensor* in_t, Tensor* temp_t,
+ const Tensor* in_t,
DoneCallback done_callback) {
std::unique_ptr<Participant> participant(
- new Participant(in_t, temp_t, event_mgr, tensor_stream, executor,
- gpu_device_id, std::move(done_callback)));
+ new Participant(in_t, nullptr /* out_t */, event_mgr, tensor_stream,
+ executor, gpu_device_id, std::move(done_callback)));
AddParticipant(num_devices, key, std::move(participant), in_t->dtype(),
kReduce, reduction_op);
}
@@ -462,7 +462,9 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
}
case kReduce: {
const void* sendbuff = p->in_t->tensor_data().data();
- void* recvbuff = const_cast<char*>(p->out_t->tensor_data().data());
+ void* recvbuff = p->out_t
+ ? const_cast<char*>(p->out_t->tensor_data().data())
+ : nullptr;
nccl_result = ncclReduce(sendbuff, recvbuff, p->in_t->NumElements(),
data_type, collective->reduction_op,
collective->root_rank, nccl_comm, *cu_stream);
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index 619a1b69bf..cb1719c3be 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -82,8 +82,7 @@ class NcclManager {
perftools::gputools::StreamExecutor* executor,
int gpu_device_id, EventMgr* event_mgr,
perftools::gputools::Stream* tensor_stream,
- const Tensor* in_t, Tensor* temp_t,
- DoneCallback done_callback);
+ const Tensor* in_t, DoneCallback done_callback);
void AddReduceRecv(int num_devices, const string& key,
ncclRedOp_t reduction_op,
perftools::gputools::StreamExecutor* executor,
diff --git a/tensorflow/contrib/nccl/kernels/nccl_ops.cc b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
index 81cc74416b..4eb52492db 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_ops.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
@@ -121,14 +121,7 @@ class NcclReduceSendKernel : public NcclReduceOpBase {
: NcclReduceOpBase(c) {}
void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
- const Tensor& in_t = c->input(0);
- std::unique_ptr<Tensor> temp_ptr(new Tensor());
- OP_REQUIRES_OK_ASYNC(
- c, c->allocate_temp(in_t.dtype(), in_t.shape(), temp_ptr.get()), done);
- Tensor* temp_t = temp_ptr.release();
-
- auto actual_done = [c, done, temp_t](Status s) {
- delete temp_t;
+ auto actual_done = [c, done](Status s) {
OP_REQUIRES_OK_ASYNC(c, s, done);
done();
};
@@ -138,7 +131,7 @@ class NcclReduceSendKernel : public NcclReduceOpBase {
NcclManager::instance()->AddReduceSend(
num_devices(), GetCollectiveKey(c), reduction_op(),
compute_stream->parent(), gpu_info->gpu_id, gpu_info->event_mgr,
- compute_stream, &in_t, temp_t, std::move(actual_done));
+ compute_stream, &c->input(0), std::move(actual_done));
}
};
REGISTER_KERNEL_BUILDER(Name("NcclReduceSend").Device(DEVICE_GPU),