diff options
author | 2018-09-21 19:08:11 -0700 | |
---|---|---|
committer | 2018-09-21 19:12:46 -0700 | |
commit | 174e782ded74187fa81f034bb3cfedf2b100286d (patch) | |
tree | 9c3de64abc261eac44909d87bebfe601951813db /tensorflow/python/training | |
parent | 812d5505f5302944f7bdd815a5518bd289418b9d (diff) |
Update error message upon a preemption error to highlight a potential
gRPC failure and suggest increasing the number of parameter servers.
PiperOrigin-RevId: 214077622
Diffstat (limited to 'tensorflow/python/training')
-rw-r--r-- | tensorflow/python/training/monitored_session.py | 24 |
1 file changed, 20 insertions, 4 deletions
```diff
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 0e0125a956..82f0e3be52 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -1114,7 +1114,11 @@ class _RecoverableSession(_WrappedSession):
         logging.info('An error was raised while a session was being created. '
                      'This may be due to a preemption of a connected worker '
                      'or parameter server. A new session will be created. '
-                     'Error: %s', e)
+                     'This error may also occur due to a gRPC failure caused '
+                     'by high memory or network bandwidth usage in the '
+                     'parameter servers. If this error occurs repeatedly, try '
+                     'increasing the number of parameter servers assigned to '
+                     'the job. Error: %s', e)
 
   def _check_stop(self):
     try:
@@ -1127,7 +1131,11 @@ class _RecoverableSession(_WrappedSession):
                    'session is complete. This may be due to a preemption in '
                    'a connected worker or parameter server. The current '
                    'session will be closed and a new session will be '
-                   'created. Error: %s', e)
+                   'created. This error may also occur due to a gRPC failure '
+                   'caused by high memory or network bandwidth usage in the '
+                   'parameter servers. If this error occurs repeatedly, try '
+                   'increasing the number of parameter servers assigned to '
+                   'the job. Error: %s', e)
       self.close()
       self._sess = self._create_session()
       # Since we have just recreated the session, the overall computation should
@@ -1150,7 +1158,11 @@ class _RecoverableSession(_WrappedSession):
         logging.info('An error was raised. This may be due to a preemption in '
                      'a connected worker or parameter server. The current '
                      'session will be closed and a new session will be '
-                     'created. Error: %s', e)
+                     'created. This error may also occur due to a gRPC failure '
+                     'caused by high memory or network bandwidth usage in the '
+                     'parameter servers. If this error occurs repeatedly, try '
+                     'increasing the number of parameter servers assigned to '
+                     'the job. Error: %s', e)
         self.close()
         self._sess = None
 
@@ -1166,7 +1178,11 @@ class _RecoverableSession(_WrappedSession):
         logging.info('An error was raised. This may be due to a preemption in '
                      'a connected worker or parameter server. The current '
                      'session will be closed and a new session will be '
-                     'created. Error: %s', e)
+                     'created. This error may also occur due to a gRPC failure '
+                     'caused by high memory or network bandwidth usage in the '
+                     'parameter servers. If this error occurs repeatedly, try '
+                     'increasing the number of parameter servers assigned to '
+                     'the job. Error: %s', e)
         self.close()
         self._sess = None
 
```