diff options
author | buchgr <buchgr@google.com> | 2018-07-17 07:12:33 -0700 |
---|---|---|
committer | Copybara-Service <copybara-piper@google.com> | 2018-07-17 07:13:39 -0700 |
commit | b8977a4cd6cd3b2a83b0596d8e477f8d89988d8a (patch) | |
tree | 70f454368f1dcc9026a9f576f1d067bf3bf53e0e /src/main/java/com/google/devtools/build/lib/buildeventservice | |
parent | 5b5ace48f108916b800539e9db4a7d9c820a1158 (diff) |
bes: fix bug where the bes upload would lose events
If a local file upload fails, then the build event
gets lost. This was the case because of a bug in the
implementation where local file upload errors are
retried by the BES retrier. This is the root cause
for the "Non-consecutive sequence number" errors we
have been seeing recently.
RELNOTES: None
PiperOrigin-RevId: 204906550
Diffstat (limited to 'src/main/java/com/google/devtools/build/lib/buildeventservice')
-rw-r--r-- | src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceTransport.java | 78 |
1 files changed, 45 insertions, 33 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceTransport.java b/src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceTransport.java index 09acb0c216..0db5212615 100644 --- a/src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceTransport.java +++ b/src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceTransport.java @@ -69,6 +69,7 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.ThreadFactory; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; @@ -118,7 +119,7 @@ public class BuildEventServiceTransport implements BuildEventTransport { * previous call was successful, this field is null. This is useful for error reporting, when an * upload times out due to having had to retry several times. */ - private volatile Exception lastKnownError; + private volatile Exception lastRetryError; /** Returns true if we already reported a warning or error to UI. */ private volatile boolean errorsReported; /** @@ -194,7 +195,16 @@ public class BuildEventServiceTransport implements BuildEventTransport { // fix would be to remove the spinning loop from publishEventStream and instead implement the // loop by publishEventStream re-submitting itself to the executor. // TODO(buchgr): Fix it. - this.uploaderExecutorService = listeningDecorator(Executors.newFixedThreadPool(2)); + this.uploaderExecutorService = listeningDecorator(Executors.newFixedThreadPool(2, + new ThreadFactory() { + + private final AtomicInteger count = new AtomicInteger(); + + @Override + public Thread newThread(Runnable r) { + return new Thread(r, "bes-uploader-" + count.incrementAndGet()); + } + })); this.protocolOptions = protocolOptions; this.invocationResult = UNKNOWN_STATUS; this.uploadTimeout = uploadTimeout; @@ -334,13 +344,13 @@ public class BuildEventServiceTransport implements BuildEventTransport { String message; if (t instanceof TimeoutException) { message = "Build Event Protocol upload timed out."; - Exception lastKnownError0 = lastKnownError; - if (lastKnownError0 != null) { + Exception lastRetryError0 = lastRetryError; + if (lastRetryError0 != null) { // We may at times get a timeout exception due to an underlying error that was retried // several times. If such an error exists, report it. message += " Transport errors caused the upload to be retried."; message += " Last known reason for retry: "; - message += besClient.userReadableError(lastKnownError0); + message += besClient.userReadableError(lastRetryError0); return message; } return message; @@ -356,7 +366,7 @@ public class BuildEventServiceTransport implements BuildEventTransport { } } - private void reportErrorAndFailBuild(Throwable t) { + protected void reportErrorAndFailBuild(Throwable t) { String message = errorMessageFromException(t); if (errorsShouldFailTheBuild) { commandLineReporter.handle(Event.error(message)); @@ -505,29 +515,8 @@ public class BuildEventServiceTransport implements BuildEventTransport { do { orderedBuildEvent = pendingSend.pollFirst(STREAMING_RPC_POLL_IN_SECS, TimeUnit.SECONDS); if (orderedBuildEvent != null) { - PathConverter pathConverter; - try { - // Wait for the local file upload to have been completed. - pathConverter = orderedBuildEvent.localFileUploadProgress().get(); - } catch (ExecutionException e) { - logger.log( - Level.WARNING, - String.format( - "Aborting publishBuildToolEventStream RPC because of a failure to " - + "upload referenced artifacts: %s", - e.getMessage()), - e); - besClient.abortStream(Status.INTERNAL.augmentDescription(e.getMessage())); - throw e; - } catch (InterruptedException e) { - // By convention the interrupted flag should have been cleared, - // but just to be sure clear it. - Thread.interrupted(); - String additionalDetails = "Uploading referenced artifacts"; - besClient.abortStream(Status.CANCELLED.augmentDescription(additionalDetails)); - throw e; - } pendingAck.add(orderedBuildEvent); + PathConverter pathConverter = waitForLocalFileUploads(orderedBuildEvent); besClient.sendOverStream(orderedBuildEvent.serialize(pathConverter)); } checkState(besClient.isStreamActive(), "Stream was closed prematurely."); @@ -575,6 +564,27 @@ public class BuildEventServiceTransport implements BuildEventTransport { } } + private PathConverter waitForLocalFileUploads(InternalOrderedBuildEvent orderedBuildEvent) + throws LocalFileUploadException, InterruptedException { + try { + // Wait for the local file upload to have been completed. + return orderedBuildEvent.localFileUploadProgress().get(); + } catch (ExecutionException e) { + logger.log( + Level.WARNING, + String.format( + "Failed to upload local files referenced by build event: %s", e.getMessage()), e); + throw new LocalFileUploadException(e.getCause()); + } + } + + private static class LocalFileUploadException extends Exception { + + public LocalFileUploadException(Throwable cause) { + super(cause); + } + } + private Function<PublishBuildToolEventStreamResponse, Void> ackCallback( final Deque<InternalOrderedBuildEvent> pendingAck, final BuildEventServiceClient besClient) { return ack -> { @@ -608,10 +618,12 @@ public class BuildEventServiceTransport implements BuildEventTransport { try { acksReceivedSinceLastRetry.set(0); c.call(); - lastKnownError = null; + lastRetryError = null; return; } catch (InterruptedException e) { throw e; + } catch (LocalFileUploadException e) { + throw (Exception) e.getCause(); } catch (Exception e) { if (acksReceivedSinceLastRetry.get() > 0) { logger.fine( @@ -620,7 +632,7 @@ public class BuildEventServiceTransport implements BuildEventTransport { tries = 0; } tries++; - lastKnownError = e; + lastRetryError = e; long sleepMillis; if (tries == 1) { sleepMillis = initialDelayMillis; @@ -629,12 +641,12 @@ public class BuildEventServiceTransport implements BuildEventTransport { sleepMillis = (long) (delayMillis * Math.pow(1.6, tries)); } String message = String.format("Retrying RPC to BES. Backoff %s ms.", sleepMillis); - logger.log(Level.INFO, message, lastKnownError); + logger.log(Level.INFO, message, lastRetryError); sleeper.sleepMillis(sleepMillis); } } - Preconditions.checkNotNull(lastKnownError); - throw lastKnownError; + Preconditions.checkNotNull(lastRetryError); + throw lastRetryError; } private void report(EventKind eventKind, String msg, Object... parameters) { |