diff options
author | 2018-01-22 17:26:43 -0800 | |
---|---|---|
committer | 2018-01-22 17:30:59 -0800 | |
commit | 6042b5d267f42d004087b44c29525951700579f9 (patch) | |
tree | c4aa81597cc062a64b200758ce6dec5a1f5f16e5 /tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc | |
parent | 2968447d32bdfd0dd6fafabfcd1aafd6dc261803 (diff) |
Reject retried RecvTensor requests.
Retried RecvTensorRequests are problematic because a RecvTensor with no
corresponding sender will wait forever, and the tensor may have been delivered
to a previous retry.
This change adds a unique request_id to each RecvTensor request, and we check
these request_ids against a set of recent request_ids. If a request_id is in the
recent set, we reject the RecvTensor request.
PiperOrigin-RevId: 182863245
Diffstat (limited to 'tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc')
-rw-r--r-- | tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc | 8 |
1 files changed, 6 insertions, 2 deletions
```diff
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
index 1a2563d20f..8d14a3ef04 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
@@ -33,8 +33,10 @@ limitations under the License.
 namespace tensorflow {
 
 MPIRendezvousMgr::MPIRendezvousMgr(const WorkerEnv* env)
-    : BaseRendezvousMgr(env), worker_env_2(env), use_optimal_transfer_(false) {
-
+    : BaseRendezvousMgr(env),
+      worker_env_2(env),
+      use_optimal_transfer_(false),
+      recv_tensor_recent_request_ids_(100000) {
   const char* mpienv = getenv("MPI_OPTIMAL_PATH");
   if (mpienv && mpienv[0] == '1') {
     LOG(INFO) << "MPI Optimal copy path enabled (Requires CUDA-Aware MPI when "
@@ -149,6 +151,8 @@ MPIRemoteRendezvous::~MPIRemoteRendezvous() {}
  */
 void MPIRendezvousMgr::AddRequest(RecvTensorRequest request,
                                   const int mpi_dst) {
+  TF_CHECK_OK(recv_tensor_recent_request_ids_.TrackUnique(
+      req.request_id(), "RecvTensor (MPIRendezvousMgr)", req));
   const int64 step_id = request.step_id();
   const std::string& key = request.rendezvous_key();
   Rendezvous::ParsedKey parsed;
```