aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/main/java/com/google/devtools/build
diff options
context:
space:
mode:
authorGravatar buchgr <buchgr@google.com>2017-09-12 12:41:31 +0200
committerGravatar Philipp Wollermann <philwo@google.com>2017-09-12 14:08:30 +0200
commit763d964b7428bbc16db7354332b5d43b93bbc9b0 (patch)
treeb9afa77afc46cf433fe8e7419bf1fb70cc3d3a02 /src/main/java/com/google/devtools/build
parent31089bba3aecc9192220ab91a0149560498f6965 (diff)
remote: Add new retrier with support for circuit breaking
Add a generic retrier implementation (Retrier2) that can be configured by plugging in a backoff strategy, a function to decide on retriable errors and a circuit breaker. A concrete implementation is added via RemoteRetrier that mostly is a copy of the code of the existing Retrier. Retrier2 adds support for circuit breaking [1]. It allows the retrier to reject execution when failure rates are high. The remote execution code will use this to gently switch between local and remote execution/caching if the latter experiences lots of failures. Retrier2 is also useful when not used with gRPC. We need retriers for the HTTP caching interface too. All the code added in this CL is unused, to keep reviews managable. In a follow up CL, I will switch the code to use the new Retrier and delete the old retrier. [1] https://martinfowler.com/bliki/CircuitBreaker.html PiperOrigin-RevId: 168355597
Diffstat (limited to 'src/main/java/com/google/devtools/build')
-rw-r--r--src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java179
-rw-r--r--src/main/java/com/google/devtools/build/lib/remote/Retrier2.java260
2 files changed, 439 insertions, 0 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java
new file mode 100644
index 0000000000..520a5f821a
--- /dev/null
+++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java
@@ -0,0 +1,179 @@
+// Copyright 2017 The Bazel Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.devtools.build.lib.remote;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.devtools.build.lib.util.Preconditions;
+import io.grpc.Status;
+import io.grpc.StatusException;
+import io.grpc.StatusRuntimeException;
+import java.time.Duration;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.function.Predicate;
+import java.util.function.Supplier;
+
+/**
+ * Specific retry logic for remote execution/caching.
+ *
+ * <p>A call can disable retries by throwing a {@link PassThroughException}.
+ * <code>
+ * RemoteRetrier r = ...;
+ * try {
+ * r.execute(() -> {
+ * // Not retried.
+ * throw PassThroughException(new IOException("fail"));
+ * }
+ * } catch (RetryException2 e) {
+ * // e.getCause() is the IOException
+ * System.out.println(e.getCause());
+ * }
+ * </code>
+ */
+class RemoteRetrier extends Retrier2 {
+
+ /**
+ * Wraps around an {@link Exception} to make it pass through a single layer of retries.
+ */
+ public static class PassThroughException extends Exception {
+ public PassThroughException(Exception e) {
+ super(e);
+ }
+ }
+
+ public static final Predicate<? super Exception> RETRIABLE_GRPC_ERRORS =
+ e -> {
+ if (!(e instanceof StatusException) && !(e instanceof StatusRuntimeException)) {
+ return false;
+ }
+ Status s = Status.fromThrowable(e);
+ switch (s.getCode()) {
+ case CANCELLED:
+ return !Thread.currentThread().isInterrupted();
+ case UNKNOWN:
+ case DEADLINE_EXCEEDED:
+ case ABORTED:
+ case INTERNAL:
+ case UNAVAILABLE:
+ case UNAUTHENTICATED:
+ return true;
+ default:
+ return false;
+ }
+ };
+
+ public RemoteRetrier(RemoteOptions options, Predicate<? super Exception> shouldRetry,
+ CircuitBreaker circuitBreaker) {
+ this(options.experimentalRemoteRetry
+ ? () -> new ExponentialBackoff(options)
+ : () -> RETRIES_DISABLED,
+ shouldRetry,
+ circuitBreaker);
+ }
+
+ public RemoteRetrier(Supplier<Backoff> backoff, Predicate<? super Exception> shouldRetry,
+ CircuitBreaker circuitBreaker) {
+ super(backoff, supportPassthrough(shouldRetry), circuitBreaker);
+ }
+
+ @VisibleForTesting
+ RemoteRetrier(Supplier<Backoff> backoff, Predicate<? super Exception> shouldRetry,
+ CircuitBreaker circuitBreaker, Sleeper sleeper) {
+ super(backoff, supportPassthrough(shouldRetry), circuitBreaker, sleeper);
+ }
+
+ @Override
+ public <T> T execute(Callable<T> call) throws RetryException2, InterruptedException {
+ try {
+ return super.execute(call);
+ } catch (RetryException2 e) {
+ if (e.getCause() instanceof PassThroughException) {
+ PassThroughException passThrough = (PassThroughException) e.getCause();
+ throw new RetryException2("Retries aborted because of PassThroughException",
+ e.getAttempts(), (Exception) passThrough.getCause());
+ }
+ throw e;
+ }
+ }
+
+
+ private static Predicate<? super Exception> supportPassthrough(
+ Predicate<? super Exception> delegate) {
+ // PassThroughException is not retriable.
+ return e -> !(e instanceof PassThroughException) && delegate.test(e);
+ }
+
+ static class ExponentialBackoff implements Retrier2.Backoff {
+
+ private final long maxMillis;
+ private long nextDelayMillis;
+ private int attempts = 0;
+ private final double multiplier;
+ private final double jitter;
+ private final int maxAttempts;
+
+ /**
+ * Creates a Backoff supplier for an optionally jittered exponential backoff. The supplier is
+ * ThreadSafe (non-synchronized calls to get() are fine), but the returned Backoff is not.
+ *
+ * @param initial The initial backoff duration.
+ * @param max The maximum backoff duration.
+ * @param multiplier The amount the backoff should increase in each iteration. Must be >1.
+ * @param jitter The amount the backoff should be randomly varied (0-1), with 0 providing no
+ * jitter, and 1 providing a duration that is 0-200% of the non-jittered duration.
+ * @param maxAttempts Maximal times to attempt a retry 0 means no retries.
+ */
+ ExponentialBackoff(Duration initial, Duration max, double multiplier, double jitter,
+ int maxAttempts) {
+ Preconditions.checkArgument(multiplier > 1, "multipler must be > 1");
+ Preconditions.checkArgument(jitter >= 0 && jitter <= 1, "jitter must be in the range (0, 1)");
+ Preconditions.checkArgument(maxAttempts >= 0, "maxAttempts must be >= 0");
+ nextDelayMillis = initial.toMillis();
+ maxMillis = max.toMillis();
+ this.multiplier = multiplier;
+ this.jitter = jitter;
+ this.maxAttempts = maxAttempts;
+ }
+
+ ExponentialBackoff(RemoteOptions options) {
+ this(Duration.ofMillis(options.experimentalRemoteRetryStartDelayMillis),
+ Duration.ofMillis(options.experimentalRemoteRetryMaxDelayMillis),
+ options.experimentalRemoteRetryMultiplier,
+ options.experimentalRemoteRetryJitter,
+ options.experimentalRemoteRetryMaxAttempts);
+ }
+
+ @Override
+ public long nextDelayMillis() {
+ if (attempts == maxAttempts) {
+ return -1;
+ }
+ attempts++;
+ double jitterRatio = jitter * (ThreadLocalRandom.current().nextDouble(2.0) - 1);
+ long result = (long) (nextDelayMillis * (1 + jitterRatio));
+ // Advance current by the non-jittered result.
+ nextDelayMillis = (long) (nextDelayMillis * multiplier);
+ if (nextDelayMillis > maxMillis) {
+ nextDelayMillis = maxMillis;
+ }
+ return result;
+ }
+
+ @Override
+ public int getRetryAttempts() {
+ return attempts;
+ }
+ }
+}
diff --git a/src/main/java/com/google/devtools/build/lib/remote/Retrier2.java b/src/main/java/com/google/devtools/build/lib/remote/Retrier2.java
new file mode 100644
index 0000000000..3f817d0fcc
--- /dev/null
+++ b/src/main/java/com/google/devtools/build/lib/remote/Retrier2.java
@@ -0,0 +1,260 @@
+// Copyright 2017 The Bazel Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.devtools.build.lib.remote;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.devtools.build.lib.remote.Retrier2.CircuitBreaker.State;
+import java.io.IOException;
+import java.util.concurrent.Callable;
+import java.util.concurrent.TimeUnit;
+import java.util.function.Predicate;
+import java.util.function.Supplier;
+import javax.annotation.concurrent.ThreadSafe;
+
+/**
+ * Supports retrying the execution of a {@link Callable} in case of failure.
+ *
+ * <p>The errors that are retried are configurable via a {@link Predicate<? super Exception>}. The
+ * delay between executions is specified by a {@link Backoff}. Additionally, the retrier supports
+ * circuit breaking to stop execution in case of high failure rates.
+ */
+// TODO(buchgr): Move to a different package and use it for BES code.
+@ThreadSafe
+class Retrier2 {
+
+ /**
+ * A backoff strategy.
+ */
+ public interface Backoff {
+
+ /**
+ * Returns the next delay in milliseconds, or a value less than {@code 0} if we should stop
+ * retrying.
+ */
+ long nextDelayMillis();
+
+ /**
+ * Returns the number of calls to {@link #nextDelayMillis()} thus far, not counting any calls
+ * that returned less than {@code 0}.
+ */
+ int getRetryAttempts();
+ }
+
+ /**
+ * The circuit breaker allows to reject execution when failure rates are high.
+ *
+ * <p>The initial state of a circuit breaker is the {@link State#ACCEPT_CALLS}. Calls are
+ * executed and retried in this state. However, if error rates are high a circuit breaker can
+ * choose to transition into {@link State#REJECT_CALLS}. In this state any calls are rejected with
+ * a {@link RetryException2} immediately. A circuit breaker in state {@link State#REJECT_CALLS}
+ * can periodically return a {@code TRIAL_CALL} state, in which case a call will be executed once
+ * and in case of success the circuit breaker may return to state {@code ACCEPT_CALLS}.
+ *
+ * <p>A circuit breaker implementation must be thread-safe.
+ *
+ * @see <a href = "https://martinfowler.com/bliki/CircuitBreaker.html">CircuitBreaker</a>
+ */
+ public interface CircuitBreaker {
+
+ enum State {
+ /**
+ * Calls are executed and retried in case of failure.
+ *
+ * <p>The circuit breaker can transition into state {@link State#REJECT_CALLS}.
+ */
+ ACCEPT_CALLS,
+
+ /**
+ * A call is executed and not retried in case of failure.
+ *
+ * <p>The circuit breaker can transition into any state.
+ */
+ TRIAL_CALL,
+
+ /**
+ * All calls are rejected.
+ *
+ * <p>The circuit breaker can transition into state {@link State#TRIAL_CALL}.
+ */
+ REJECT_CALLS
+ }
+
+ /**
+ * Returns the current {@link State} of the circuit breaker.
+ */
+ State state();
+
+ /**
+ * Called after an execution failed.
+ */
+ void recordFailure();
+
+ /**
+ * Called after an execution succeeded.
+ */
+ void recordSuccess();
+ }
+
+ public interface Sleeper {
+ void sleep(long millis) throws InterruptedException;
+ }
+
+ public static class RetryException2 extends IOException {
+
+ private final int attempts;
+
+ public RetryException2(String message, int numRetries, Exception cause) {
+ super(message, cause);
+ this.attempts = numRetries + 1;
+ }
+
+ protected RetryException2(String message) {
+ super(message);
+ this.attempts = 0;
+ }
+
+ /**
+ * Returns the number of times a {@link Callable} has been executed before this exception
+ * was thrown.
+ */
+ public int getAttempts() {
+ return attempts;
+ }
+ }
+
+ public static class CircuitBreakerException extends RetryException2 {
+
+ private CircuitBreakerException(String message, int numRetries, Exception cause) {
+ super(message, numRetries, cause);
+ }
+
+ private CircuitBreakerException() {
+ super("Call not executed due to a high failure rate.");
+ }
+ }
+
+ public static final CircuitBreaker ALLOW_ALL_CALLS = new CircuitBreaker() {
+ @Override
+ public State state() {
+ return State.ACCEPT_CALLS;
+ }
+
+ @Override
+ public void recordFailure() {
+ }
+
+ @Override
+ public void recordSuccess() {
+ }
+ };
+
+ public static final Backoff RETRIES_DISABLED = new Backoff() {
+ @Override
+ public long nextDelayMillis() {
+ return -1;
+ }
+
+ @Override
+ public int getRetryAttempts() {
+ return 0;
+ }
+ };
+
+ private final Supplier<Backoff> backoffSupplier;
+ private final Predicate<? super Exception> shouldRetry;
+ private final CircuitBreaker circuitBreaker;
+ private final Sleeper sleeper;
+
+ public Retrier2 (Supplier<Backoff> backoffSupplier, Predicate<? super Exception> shouldRetry,
+ CircuitBreaker circuitBreaker) {
+ this(backoffSupplier, shouldRetry, circuitBreaker, TimeUnit.MILLISECONDS::sleep);
+ }
+
+ @VisibleForTesting
+ Retrier2 (Supplier<Backoff> backoffSupplier, Predicate<? super Exception> shouldRetry,
+ CircuitBreaker circuitBreaker, Sleeper sleeper) {
+ this.backoffSupplier = backoffSupplier;
+ this.shouldRetry = shouldRetry;
+ this.circuitBreaker = circuitBreaker;
+ this.sleeper = sleeper;
+ }
+
+ /**
+ * Execute a {@link Callable}, retrying execution in case of failure and returning the result in
+ * case of success.
+ *
+ * <p>{@link InterruptedException} is not retried.
+ *
+ * @param call the {@link Callable} to execute.
+ * @throws RetryException2 if the {@code call} didn't succeed within the framework specified by
+ * {@code backoffSupplier} and {@code shouldRetry}.
+ * @throws CircuitBreakerException in case a call was rejected because the circuit breaker
+ * tripped.
+ * @throws InterruptedException if the {@code call} throws an {@link InterruptedException} or the
+ * current thread's interrupted flag is set.
+ */
+ public <T> T execute(Callable<T> call) throws RetryException2, InterruptedException {
+ final Backoff backoff = newBackoff();
+ while (true) {
+ final State circuitState;
+ circuitState = circuitBreaker.state();
+ if (State.REJECT_CALLS.equals(circuitState)) {
+ throw new CircuitBreakerException();
+ }
+ try {
+ if (Thread.interrupted()) {
+ throw new InterruptedException();
+ }
+ T r = call.call();
+ circuitBreaker.recordSuccess();
+ return r;
+ } catch (InterruptedException e) {
+ circuitBreaker.recordFailure();
+ throw e;
+ } catch (Exception e) {
+ circuitBreaker.recordFailure();
+ if (e instanceof RetryException2) {
+ // Support nested retry calls.
+ e = (Exception) e.getCause();
+ }
+ if (State.TRIAL_CALL.equals(circuitState)) {
+ throw new CircuitBreakerException("Call failed in circuit breaker half open state.", 0,
+ e);
+ }
+ if (!shouldRetry.test(e)) {
+ throw new RetryException2("Call failed with not retriable error.",
+ backoff.getRetryAttempts(), e);
+ }
+ final long delayMillis = backoff.nextDelayMillis();
+ if (delayMillis < 0) {
+ throw new RetryException2("Call failed after exhausting retry attempts: "
+ + backoff.getRetryAttempts(), backoff.getRetryAttempts(), e);
+ }
+ sleeper.sleep(delayMillis);
+ }
+ }
+ }
+
+ //TODO(buchgr): Add executeAsync to be used by ByteStreamUploader
+ // <T> ListenableFuture<T> executeAsync(AsyncCallable<T> call, ScheduledExecutorService executor)
+
+ public Backoff newBackoff() {
+ return backoffSupplier.get();
+ }
+
+ public boolean isRetriable(Exception e) {
+ return shouldRetry.test(e);
+ }
+}