diff options
author | buchgr <buchgr@google.com> | 2017-09-26 10:41:37 +0200 |
---|---|---|
committer | Vladimir Moskva <vladmos@google.com> | 2017-09-26 12:31:53 +0200 |
commit | 926fd297ee08fa18d3206c25f74e70e0156d7aac (patch) | |
tree | 30826855154102e8666b872934b98f47d56ac9f2 /src/main/java/com/google/devtools/build/lib/buildeventservice | |
parent | 618f9816de7366dc01bca7f39a4622dbbd53fb75 (diff) |
Improve BES retries.
- The upload of build events is retried infinitely often as long as
between two failures progress was made, that is ACKs were received.
- Set the initial backoff after a failure to 0, and every further
backoff to 1s. This aligns relatively well with the backoff times
gRPC connections are retried [1].
PiperOrigin-RevId: 170022707
Diffstat (limited to 'src/main/java/com/google/devtools/build/lib/buildeventservice')
-rw-r--r-- | src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceTransport.java | 80 |
1 files changed, 46 insertions, 34 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceTransport.java b/src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceTransport.java index f453add769..5df6580499 100644 --- a/src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceTransport.java +++ b/src/main/java/com/google/devtools/build/lib/buildeventservice/BuildEventServiceTransport.java @@ -48,6 +48,8 @@ import com.google.devtools.build.lib.events.EventKind; import com.google.devtools.build.lib.runtime.BlazeModule.ModuleEnvironment; import com.google.devtools.build.lib.util.AbruptExitException; import com.google.devtools.build.lib.util.ExitCode; +import com.google.devtools.build.lib.util.JavaSleeper; +import com.google.devtools.build.lib.util.Sleeper; import com.google.devtools.build.v1.BuildStatus.Result; import com.google.devtools.build.v1.PublishBuildToolEventStreamRequest; import com.google.devtools.build.v1.PublishBuildToolEventStreamResponse; @@ -65,7 +67,6 @@ import java.util.concurrent.Future; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; -import java.util.concurrent.locks.LockSupport; import java.util.logging.Level; import java.util.logging.Logger; import javax.annotation.Nullable; @@ -90,8 +91,8 @@ public class BuildEventServiceTransport implements BuildEventTransport { private final BuildEventServiceProtoUtil besProtoUtil; private final ModuleEnvironment moduleEnvironment; private final EventHandler commandLineReporter; - private final PathConverter pathConverter; + private final Sleeper sleeper; /** Contains all pendingAck events that might be retried in case of failures. */ private ConcurrentLinkedDeque<PublishBuildToolEventStreamRequest> pendingAck; /** Contains all events should be sent ordered by sequence number. */ @@ -110,6 +111,11 @@ public class BuildEventServiceTransport implements BuildEventTransport { private volatile Exception lastKnownError; /** Returns true if we already reported a warning or error to UI. */ private volatile boolean errorsReported; + /** + * Returns the number of ACKs received since the last time {@link #publishEventStream()} was + * retried due to a failure. + */ + private volatile int acksReceivedSinceLastRetry; public BuildEventServiceTransport( BuildEventServiceClient besClient, @@ -124,29 +130,29 @@ public class BuildEventServiceTransport implements BuildEventTransport { PathConverter pathConverter, EventHandler commandLineReporter, @Nullable String projectId) { - this( - besClient, - uploadTimeout, - bestEffortUpload, - publishLifecycleEvents, - moduleEnvironment, - new BuildEventServiceProtoUtil(buildRequestId, invocationId, projectId, command, clock), - pathConverter, - commandLineReporter); + this(besClient, uploadTimeout, bestEffortUpload, publishLifecycleEvents, buildRequestId, + invocationId, command, moduleEnvironment, clock, pathConverter, commandLineReporter, + projectId, new JavaSleeper()); } @VisibleForTesting - BuildEventServiceTransport( + public BuildEventServiceTransport( BuildEventServiceClient besClient, Duration uploadTimeout, boolean bestEffortUpload, boolean publishLifecycleEvents, + String buildRequestId, + String invocationId, + String command, ModuleEnvironment moduleEnvironment, - BuildEventServiceProtoUtil besProtoUtil, + Clock clock, PathConverter pathConverter, - EventHandler commandLineReporter) { + EventHandler commandLineReporter, + @Nullable String projectId, + Sleeper sleeper) { this.besClient = besClient; - this.besProtoUtil = besProtoUtil; + this.besProtoUtil = + new BuildEventServiceProtoUtil(buildRequestId, invocationId, projectId, command, clock); this.publishLifecycleEvents = publishLifecycleEvents; this.moduleEnvironment = moduleEnvironment; this.commandLineReporter = commandLineReporter; @@ -162,6 +168,7 @@ public class BuildEventServiceTransport implements BuildEventTransport { this.invocationResult = UNKNOWN_STATUS; this.uploadTimeout = uploadTimeout; this.bestEffortUpload = bestEffortUpload; + this.sleeper = sleeper; } @Override @@ -449,7 +456,7 @@ public class BuildEventServiceTransport implements BuildEventTransport { } /** Method responsible for a single Streaming RPC. */ - private static void publishEventStream( + private void publishEventStream( final ConcurrentLinkedDeque<PublishBuildToolEventStreamRequest> pendingAck, final BlockingDeque<PublishBuildToolEventStreamRequest> pendingSend, final BuildEventServiceClient besClient) @@ -502,7 +509,8 @@ public class BuildEventServiceTransport implements BuildEventTransport { && event.getOrderedBuildEvent().getEvent().getEventCase() == COMPONENT_STREAM_FINISHED; } - private static Function<PublishBuildToolEventStreamResponse, Void> ackCallback( + @SuppressWarnings("NonAtomicVolatileUpdate") + private Function<PublishBuildToolEventStreamResponse, Void> ackCallback( final Deque<PublishBuildToolEventStreamRequest> pendingAck, final BuildEventServiceClient besClient) { return ack -> { @@ -522,43 +530,47 @@ public class BuildEventServiceTransport implements BuildEventTransport { logger.log(Level.INFO, "Last ACK received."); besClient.closeStream(); } + acksReceivedSinceLastRetry++; return null; }; } - private void retryOnException(Callable<?> c) throws Exception { - retryOnException(c, 3, 100); - } - /** * Executes a {@link Callable} retrying on exception thrown. */ // TODO(eduardocolaco): Implement transient/persistent failures - private void retryOnException(Callable<?> c, final int maxRetries, final long initalDelayMillis) - throws Exception { + private void retryOnException(Callable<?> c) throws Exception { + final int maxRetries = 5; + final long initialDelayMillis = 0; + final long delayMillis = 1000; + int tries = 0; while (tries <= maxRetries) { try { + acksReceivedSinceLastRetry = 0; c.call(); lastKnownError = null; return; } catch (InterruptedException e) { throw e; } catch (Exception e) { + if (acksReceivedSinceLastRetry > 0) { + logger.fine(String.format("ACKs received since last retry %d.", + acksReceivedSinceLastRetry)); + tries = 0; + } tries++; lastKnownError = e; - /* - * Exponential backoff: - * Retry 1: initalDelayMillis * 2^0 - * Retry 2: initalDelayMillis * 2^1 - * Retry 3: initalDelayMillis * 2^2 - * ... - */ - long sleepMillis = initalDelayMillis << (tries - 1); - String message = String.format("Retrying RPC to BES. Attempt %s. Backoff %s ms.", - tries, sleepMillis); + long sleepMillis; + if (tries == 1) { + sleepMillis = initialDelayMillis; + } else { + // This roughly matches the gRPC connection backoff. + sleepMillis = (long) (delayMillis * Math.pow(1.6, tries)); + } + String message = String.format("Retrying RPC to BES. Backoff %s ms.", sleepMillis); logger.log(Level.INFO, message, lastKnownError); - LockSupport.parkNanos(TimeUnit.MILLISECONDS.toNanos(sleepMillis)); + sleeper.sleepMillis(sleepMillis); } } Preconditions.checkNotNull(lastKnownError); |