aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java111
-rw-r--r--src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java22
-rw-r--r--src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java4
-rw-r--r--src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java78
-rw-r--r--src/test/java/com/google/devtools/build/lib/concurrent/ErrorClassifierTest.java38
5 files changed, 190 insertions, 63 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java b/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java
index 9db1c22021..baab8847e7 100644
--- a/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java
+++ b/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java
@@ -61,12 +61,21 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
};
/**
- * The first unhandled exception thrown by a worker thread. We save it and re-throw it from
- * the main thread to detect bugs faster; otherwise worker threads just quietly die.
+ * The most severe unhandled exception thrown by a worker thread, according to
+ * {@link #errorClassifier}. This exception gets propagated to the calling thread of
+ * {@link #awaitQuiescence} . We use the most severe error for the sake of not masking e.g.
+ * crashes in worker threads after the first critical error that can occur due to race conditions
+ * in client code.
*
- * Field updates happen only in blocks that are synchronized on the {@link
- * AbstractQueueVisitor} object; it's important to save the first one as it may be more
- * informative than a subsequent one, and this is not a performance-critical path.
+ * <p>Field updates happen only in blocks that are synchronized on the {@link
+ * AbstractQueueVisitor} object.
+ *
+ * <p>If {@link AbstractQueueVisitor} clients don't like the semantics of storing and propagating
+ * the most severe error, then they should be provide an {@link ErrorClassifier} that does the
+ * right thing (e.g. to cause the _first_ error to be propagated, you'd want to provide an
+ * {@link ErrorClassifier} that gives all errors the exact same {@link ErrorClassification}).
+ *
+ * <p>Note that this is not a performance-critical path.
*/
private volatile Throwable unhandled = null;
@@ -413,6 +422,43 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
executorService.execute(runnable);
}
+ private synchronized void maybeSaveUnhandledThrowable(Throwable e, boolean markToStopJobs) {
+ boolean critical = false;
+ ErrorClassification errorClassification = errorClassifier.classify(e);
+ switch (errorClassification) {
+ case AS_CRITICAL_AS_POSSIBLE:
+ case CRITICAL_AND_LOG:
+ critical = true;
+ LOG.log(Level.WARNING, "Found critical error in queue visitor", e);
+ break;
+ case CRITICAL:
+ critical = true;
+ break;
+ default:
+ break;
+ }
+ errorHandler.handle(e, errorClassification);
+ if (unhandled == null
+ || errorClassification.compareTo(errorClassifier.classify(unhandled)) > 0) {
+ // Save the most severe error.
+ unhandled = e;
+ exceptionLatch.countDown();
+ }
+ if (markToStopJobs) {
+ synchronized (zeroRemainingTasks) {
+ if (critical && !jobsMustBeStopped) {
+ jobsMustBeStopped = true;
+ // This introduces a benign race, but it's the best we can do. When we have multiple
+ // errors of the same severity that is at least CRITICAL, we'll end up saving (above) and
+ // propagating (in 'awaitQuiescence') the most severe one we see, but the set of errors we
+ // see is non-deterministic and is at the mercy of how quickly the calling thread of
+ // 'awaitQuiescence' can do its thing after this 'notify' call.
+ zeroRemainingTasks.notify();
+ }
+ }
+ }
+ }
+
private void recordError(Throwable e) {
try {
// If threadInterrupted is true, then RejectedExecutionExceptions are expected. There's no
@@ -422,12 +468,7 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
return;
}
catastrophe = e;
- synchronized (this) {
- if (unhandled == null) { // save only the first one.
- unhandled = e;
- exceptionLatch.countDown();
- }
- }
+ maybeSaveUnhandledThrowable(e, /*markToStopJobs=*/ false);
} finally {
decrementRemainingTasks();
}
@@ -438,13 +479,12 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
* <ul>
* <li>Sets {@link #run} to {@code true} when {@code WrappedRunnable} is run,
* <li>Records the thread evaluating {@code r} in {@link #jobs} while {@code r} is evaluated,
- * <li>Prevents {@param runnable} from being invoked if {@link #blockNewActions} returns
+ * <li>Prevents {@link #originalRunnable} from being invoked if {@link #blockNewActions} returns
* {@code true},
* <li>Synchronously invokes {@code runnable.run()},
- * <li>Catches any {@link Throwable} thrown by {@code runnable.run()}, and if it is the first
- * {@link Throwable} seen by this {@link AbstractQueueVisitor}, assigns it to {@link
- * #unhandled}, and calls {@link #markToStopAllJobsIfNeeded} to set {@link #jobsMustBeStopped}
- * if necessary,
+ * <li>Catches any {@link Throwable} thrown by {@code runnable.run()}, and if it is the most
+ * severe {@link Throwable} seen by this {@link AbstractQueueVisitor}, assigns it to
+ * {@link #unhandled}, and sets {@link #jobsMustBeStopped} if necessary,
* <li>And, lastly, calls {@link #decrementRemainingTasks}.
* </ul>
*/
@@ -473,13 +513,7 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
}
originalRunnable.run();
} catch (Throwable e) {
- synchronized (AbstractQueueVisitor.this) {
- if (unhandled == null) { // save only the first one.
- unhandled = e;
- exceptionLatch.countDown();
- }
- markToStopAllJobsIfNeeded(e);
- }
+ maybeSaveUnhandledThrowable(e, /*markToStopJobs=*/ true);
} finally {
try {
if (thread != null && addedJob) {
@@ -635,35 +669,4 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
}
}
}
-
- /**
- * Classifies a {@link Throwable} {@param e} thrown by a job.
- *
- * <p>If it is classified as critical, then this sets the {@link #jobsMustBeStopped} flag to
- * {@code true} which signals {@link #awaitTermination(boolean)} to stop all jobs.
- *
- * <p>Also logs details about {@param e} if it is classified as something that must be logged.
- */
- private void markToStopAllJobsIfNeeded(Throwable e) {
- boolean critical = false;
- ErrorClassification errorClassification = errorClassifier.classify(e);
- switch (errorClassification) {
- case CRITICAL_AND_LOG:
- critical = true;
- LOG.log(Level.WARNING, "Found critical error in queue visitor", e);
- break;
- case CRITICAL:
- critical = true;
- break;
- default:
- break;
- }
- errorHandler.handle(e, errorClassification);
- synchronized (zeroRemainingTasks) {
- if (critical && !jobsMustBeStopped) {
- jobsMustBeStopped = true;
- zeroRemainingTasks.notify();
- }
- }
- }
}
diff --git a/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java b/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java
index 5a304198a9..2e832c1fd8 100644
--- a/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java
+++ b/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java
@@ -18,14 +18,20 @@ import com.google.devtools.build.lib.util.Preconditions;
/** A classifier for {@link Error}s and {@link Exception}s. Used by {@link AbstractQueueVisitor}. */
public abstract class ErrorClassifier {
- /** Classification of an error thrown by an action. */
+ /**
+ * Classification of an error thrown by an action.
+ *
+ * <p>N.B. - These enum values are ordered from least severe to most severe.
+ */
public enum ErrorClassification {
+ /** Other running actions should be left alone.*/
+ NOT_CRITICAL,
/** All running actions should be stopped.*/
CRITICAL,
- /** Same as CRITICAL, but also log the error.*/
+ /** Same as {@link #CRITICAL}, but also log the error.*/
CRITICAL_AND_LOG,
- /** Other running actions should be left alone.*/
- NOT_CRITICAL
+ /** Same as {@link #CRITICAL_AND_LOG}, but is even worse. */
+ AS_CRITICAL_AS_POSSIBLE
}
/** Always treat exceptions as {@code NOT_CRITICAL}. */
@@ -39,19 +45,19 @@ public abstract class ErrorClassifier {
/**
* Used by {@link #classify} to classify {@link Exception}s. (Note that {@link Error}s
- * are always classified as {@code CRITICAL_AND_LOG}.)
+ * are always classified as {@code AS_CRITICAL_AS_POSSIBLE}.)
*
* @param e the exception object to check
*/
protected abstract ErrorClassification classifyException(Exception e);
/**
- * Classify {@param e}. If {@code e} is an {@link Error}, it will be classified as {@code
- * CRITICAL_AND_LOG}. Otherwise, calls {@link #classifyException}.
+ * Classify {@code e}. If {@code e} is an {@link Error}, it will be classified as
+ * {@code AS_CRITICAL_AS_POSSIBLE}. Otherwise, calls {@link #classifyException}.
*/
public final ErrorClassification classify(Throwable e) {
if (e instanceof Error) {
- return ErrorClassification.CRITICAL_AND_LOG;
+ return ErrorClassification.AS_CRITICAL_AS_POSSIBLE;
}
Preconditions.checkArgument(e instanceof Exception, e);
return classifyException((Exception) e);
diff --git a/src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java b/src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java
index 2bbf62d175..7353e5425c 100644
--- a/src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java
+++ b/src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java
@@ -735,6 +735,10 @@ public final class ParallelEvaluator implements Evaluator {
return ErrorClassification.CRITICAL;
}
if (e instanceof RuntimeException) {
+ // We treat non-SchedulerException RuntimeExceptions as more severe than
+ // SchedulerExceptions so that AbstractQueueVisitor will propagate instances of the
+ // former. They indicate actual Blaze bugs, rather than normal Skyframe evaluation
+ // control flow.
return ErrorClassification.CRITICAL_AND_LOG;
}
return ErrorClassification.NOT_CRITICAL;
diff --git a/src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java b/src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java
index 7a3982ed12..8f6d389b98 100644
--- a/src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java
+++ b/src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java
@@ -34,6 +34,7 @@ import org.junit.runners.JUnit4;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
+import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
@@ -408,7 +409,7 @@ public class AbstractQueueVisitorTest {
@Override
public void handle(Throwable t, ErrorClassification classification) {
if (t == error) {
- assertThat(classification).isEqualTo(ErrorClassification.CRITICAL_AND_LOG);
+ assertThat(classification).isEqualTo(ErrorClassification.AS_CRITICAL_AS_POSSIBLE);
criticalErrorSeen.compareAndSet(false, true);
} else {
fail();
@@ -459,6 +460,81 @@ public class AbstractQueueVisitorTest {
assertTrue(criticalErrorSeen.get());
}
+ private static class ClassifiedException extends RuntimeException {
+ private final ErrorClassification classification;
+
+ private ClassifiedException(ErrorClassification classification) {
+ this.classification = classification;
+ }
+ }
+
+ @Test
+ public void mostSevereErrorPropagated() throws Exception {
+ ThreadPoolExecutor executor = new ThreadPoolExecutor(2, 2, 0, TimeUnit.SECONDS,
+ new LinkedBlockingQueue<Runnable>());
+ final Set<Throwable> seenErrors = Sets.newConcurrentHashSet();
+ final ClassifiedException criticalException =
+ new ClassifiedException(ErrorClassification.CRITICAL);
+ final ClassifiedException criticalAndLogException =
+ new ClassifiedException(ErrorClassification.CRITICAL_AND_LOG);
+ final ErrorClassifier errorClassifier = new ErrorClassifier() {
+ @Override
+ protected ErrorClassification classifyException(Exception e) {
+ return (e instanceof ClassifiedException)
+ ? ((ClassifiedException) e).classification
+ : ErrorClassification.NOT_CRITICAL;
+ }
+ };
+ ErrorHandler errorHandler = new ErrorHandler() {
+ @Override
+ public void handle(Throwable t, ErrorClassification classification) {
+ assertThat(classification).isEqualTo(errorClassifier.classify(t));
+ seenErrors.add(t);
+ }
+ };
+ AbstractQueueVisitor visitor =
+ new AbstractQueueVisitor(
+ /*concurrent=*/ true,
+ executor,
+ /*shutdownOnCompletion=*/ true,
+ /*failFastOnException=*/ false,
+ errorClassifier,
+ errorHandler);
+ final CountDownLatch exnLatch = visitor.getExceptionLatchForTestingOnly();
+ Runnable criticalExceptionRunnable = new Runnable() {
+ @Override
+ public void run() {
+ throw criticalException;
+ }
+ };
+ Runnable criticalAndLogExceptionRunnable = new Runnable() {
+ @Override
+ public void run() {
+ // Wait for the critical exception to be thrown. There's a benign race between our 'await'
+ // call completing because the exception latch was counted down, and our thread being
+ // interrupted by AbstractQueueVisitor because the critical error was encountered. This is
+ // completely fine; all that matters is that we have a chance to throw our error _after_
+ // the previous one was thrown by the other Runnable.
+ try {
+ exnLatch.await();
+ } catch (InterruptedException e) {
+ // Ignored.
+ }
+ throw criticalAndLogException;
+ }
+ };
+ visitor.execute(criticalExceptionRunnable);
+ visitor.execute(criticalAndLogExceptionRunnable);
+ ClassifiedException exn = null;
+ try {
+ visitor.awaitQuiescence(/*interruptWorkers=*/ true);
+ } catch (ClassifiedException e) {
+ exn = e;
+ }
+ assertEquals(criticalAndLogException, exn);
+ assertThat(seenErrors).containsExactly(criticalException, criticalAndLogException);
+ }
+
private static Runnable throwingRunnable() {
return new Runnable() {
@Override
diff --git a/src/test/java/com/google/devtools/build/lib/concurrent/ErrorClassifierTest.java b/src/test/java/com/google/devtools/build/lib/concurrent/ErrorClassifierTest.java
new file mode 100644
index 0000000000..7c06270731
--- /dev/null
+++ b/src/test/java/com/google/devtools/build/lib/concurrent/ErrorClassifierTest.java
@@ -0,0 +1,38 @@
+// Copyright 2016 The Bazel Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package com.google.devtools.build.lib.concurrent;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import com.google.devtools.build.lib.concurrent.ErrorClassifier.ErrorClassification;
+import java.util.Arrays;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Tests for {@link ErrorClassifier}. */
+@RunWith(JUnit4.class)
+public class ErrorClassifierTest {
+ @Test
+ public void testErrorClassificationNaturalOrder() {
+ ErrorClassification[] values = ErrorClassification.values();
+ Arrays.sort(values);
+ assertThat(values).asList().containsExactly(
+ ErrorClassification.NOT_CRITICAL,
+ ErrorClassification.CRITICAL,
+ ErrorClassification.CRITICAL_AND_LOG,
+ ErrorClassification.AS_CRITICAL_AS_POSSIBLE).inOrder();
+ }
+}
+