aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/python/training
diff options
context:
space:
mode:
author A. Unique TensorFlower <gardener@tensorflow.org> 2018-09-21 19:08:11 -0700
committer TensorFlower Gardener <gardener@tensorflow.org> 2018-09-21 19:12:46 -0700
commit 174e782ded74187fa81f034bb3cfedf2b100286d (patch)
tree 9c3de64abc261eac44909d87bebfe601951813db /tensorflow/python/training
parent 812d5505f5302944f7bdd815a5518bd289418b9d (diff)
Update error message upon a preemption error to highlight a potential
gRPC failure and suggest increasing the number of parameter servers.

PiperOrigin-RevId: 214077622
Diffstat (limited to 'tensorflow/python/training')
-rw-r--r-- tensorflow/python/training/monitored_session.py 24
1 files changed, 20 insertions, 4 deletions
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 0e0125a956..82f0e3be52 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -1114,7 +1114,11 @@ class _RecoverableSession(_WrappedSession):
logging.info('An error was raised while a session was being created. '
'This may be due to a preemption of a connected worker '
'or parameter server. A new session will be created. '
- 'Error: %s', e)
+ 'This error may also occur due to a gRPC failure caused '
+ 'by high memory or network bandwidth usage in the '
+ 'parameter servers. If this error occurs repeatedly, try '
+ 'increasing the number of parameter servers assigned to '
+ 'the job. Error: %s', e)
def _check_stop(self):
try:
@@ -1127,7 +1131,11 @@ class _RecoverableSession(_WrappedSession):
'session is complete. This may be due to a preemption in '
'a connected worker or parameter server. The current '
'session will be closed and a new session will be '
- 'created. Error: %s', e)
+ 'created. This error may also occur due to a gRPC failure '
+ 'caused by high memory or network bandwidth usage in the '
+ 'parameter servers. If this error occurs repeatedly, try '
+ 'increasing the number of parameter servers assigned to '
+ 'the job. Error: %s', e)
self.close()
self._sess = self._create_session()
# Since we have just recreated the session, the overall computation should
@@ -1150,7 +1158,11 @@ class _RecoverableSession(_WrappedSession):
logging.info('An error was raised. This may be due to a preemption in '
'a connected worker or parameter server. The current '
'session will be closed and a new session will be '
- 'created. Error: %s', e)
+ 'created. This error may also occur due to a gRPC failure '
+ 'caused by high memory or network bandwidth usage in the '
+ 'parameter servers. If this error occurs repeatedly, try '
+ 'increasing the number of parameter servers assigned to '
+ 'the job. Error: %s', e)
self.close()
self._sess = None
@@ -1166,7 +1178,11 @@ class _RecoverableSession(_WrappedSession):
logging.info('An error was raised. This may be due to a preemption in '
'a connected worker or parameter server. The current '
'session will be closed and a new session will be '
- 'created. Error: %s', e)
+ 'created. This error may also occur due to a gRPC failure '
+ 'caused by high memory or network bandwidth usage in the '
+ 'parameter servers. If this error occurs repeatedly, try '
+ 'increasing the number of parameter servers assigned to '
+ 'the job. Error: %s', e)
self.close()
self._sess = None