aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Nathan Harmata <nharmata@google.com>2016-08-11 23:11:33 +0000
committerGravatar Yue Gan <yueg@google.com>2016-08-12 08:54:09 +0000
commit2c5d51712814d0e1f624364a013fc21795db3eda (patch)
tree6758ea76e926ebe7596fab52c7cbd397349ada62
parentd5353257a835631a83c25efff3b5560b5bbce7e7 (diff)
Have AQV propagate the most severe error encountered by any of its tasks. Make sure that ParallelEvaluator treats SchedulerExceptions as less severe than other RuntimeExceptions (it already did, but add a comment emphasizing this).
The combination of the above means that we don't ignore hypothetical crashes in ParallelEvaluator worker threads _after_ a SchedulerException (e.g. due to an error in nokeep_going evaluation). -- MOS_MIGRATED_REVID=130044230
-rw-r--r--src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java111
-rw-r--r--src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java22
-rw-r--r--src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java4
-rw-r--r--src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java78
-rw-r--r--src/test/java/com/google/devtools/build/lib/concurrent/ErrorClassifierTest.java38
5 files changed, 190 insertions, 63 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java b/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java
index 9db1c22021..baab8847e7 100644
--- a/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java
+++ b/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java
@@ -61,12 +61,21 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
};
/**
- * The first unhandled exception thrown by a worker thread. We save it and re-throw it from
- * the main thread to detect bugs faster; otherwise worker threads just quietly die.
+ * The most severe unhandled exception thrown by a worker thread, according to
+ * {@link #errorClassifier}. This exception gets propagated to the calling thread of
+ * {@link #awaitQuiescence} . We use the most severe error for the sake of not masking e.g.
+ * crashes in worker threads after the first critical error that can occur due to race conditions
+ * in client code.
*
- * Field updates happen only in blocks that are synchronized on the {@link
- * AbstractQueueVisitor} object; it's important to save the first one as it may be more
- * informative than a subsequent one, and this is not a performance-critical path.
+ * <p>Field updates happen only in blocks that are synchronized on the {@link
+ * AbstractQueueVisitor} object.
+ *
+ * <p>If {@link AbstractQueueVisitor} clients don't like the semantics of storing and propagating
+ * the most severe error, then they should be provide an {@link ErrorClassifier} that does the
+ * right thing (e.g. to cause the _first_ error to be propagated, you'd want to provide an
+ * {@link ErrorClassifier} that gives all errors the exact same {@link ErrorClassification}).
+ *
+ * <p>Note that this is not a performance-critical path.
*/
private volatile Throwable unhandled = null;
@@ -413,6 +422,43 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
executorService.execute(runnable);
}
+ private synchronized void maybeSaveUnhandledThrowable(Throwable e, boolean markToStopJobs) {
+ boolean critical = false;
+ ErrorClassification errorClassification = errorClassifier.classify(e);
+ switch (errorClassification) {
+ case AS_CRITICAL_AS_POSSIBLE:
+ case CRITICAL_AND_LOG:
+ critical = true;
+ LOG.log(Level.WARNING, "Found critical error in queue visitor", e);
+ break;
+ case CRITICAL:
+ critical = true;
+ break;
+ default:
+ break;
+ }
+ errorHandler.handle(e, errorClassification);
+ if (unhandled == null
+ || errorClassification.compareTo(errorClassifier.classify(unhandled)) > 0) {
+ // Save the most severe error.
+ unhandled = e;
+ exceptionLatch.countDown();
+ }
+ if (markToStopJobs) {
+ synchronized (zeroRemainingTasks) {
+ if (critical && !jobsMustBeStopped) {
+ jobsMustBeStopped = true;
+ // This introduces a benign race, but it's the best we can do. When we have multiple
+ // errors of the same severity that is at least CRITICAL, we'll end up saving (above) and
+ // propagating (in 'awaitQuiescence') the most severe one we see, but the set of errors we
+ // see is non-deterministic and is at the mercy of how quickly the calling thread of
+ // 'awaitQuiescence' can do its thing after this 'notify' call.
+ zeroRemainingTasks.notify();
+ }
+ }
+ }
+ }
+
private void recordError(Throwable e) {
try {
// If threadInterrupted is true, then RejectedExecutionExceptions are expected. There's no
@@ -422,12 +468,7 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
return;
}
catastrophe = e;
- synchronized (this) {
- if (unhandled == null) { // save only the first one.
- unhandled = e;
- exceptionLatch.countDown();
- }
- }
+ maybeSaveUnhandledThrowable(e, /*markToStopJobs=*/ false);
} finally {
decrementRemainingTasks();
}
@@ -438,13 +479,12 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
* <ul>
* <li>Sets {@link #run} to {@code true} when {@code WrappedRunnable} is run,
* <li>Records the thread evaluating {@code r} in {@link #jobs} while {@code r} is evaluated,
- * <li>Prevents {@param runnable} from being invoked if {@link #blockNewActions} returns
+ * <li>Prevents {@link #originalRunnable} from being invoked if {@link #blockNewActions} returns
* {@code true},
* <li>Synchronously invokes {@code runnable.run()},
- * <li>Catches any {@link Throwable} thrown by {@code runnable.run()}, and if it is the first
- * {@link Throwable} seen by this {@link AbstractQueueVisitor}, assigns it to {@link
- * #unhandled}, and calls {@link #markToStopAllJobsIfNeeded} to set {@link #jobsMustBeStopped}
- * if necessary,
+ * <li>Catches any {@link Throwable} thrown by {@code runnable.run()}, and if it is the most
+ * severe {@link Throwable} seen by this {@link AbstractQueueVisitor}, assigns it to
+ * {@link #unhandled}, and sets {@link #jobsMustBeStopped} if necessary,
* <li>And, lastly, calls {@link #decrementRemainingTasks}.
* </ul>
*/
@@ -473,13 +513,7 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
}
originalRunnable.run();
} catch (Throwable e) {
- synchronized (AbstractQueueVisitor.this) {
- if (unhandled == null) { // save only the first one.
- unhandled = e;
- exceptionLatch.countDown();
- }
- markToStopAllJobsIfNeeded(e);
- }
+ maybeSaveUnhandledThrowable(e, /*markToStopJobs=*/ true);
} finally {
try {
if (thread != null && addedJob) {
@@ -635,35 +669,4 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
}
}
}
-
- /**
- * Classifies a {@link Throwable} {@param e} thrown by a job.
- *
- * <p>If it is classified as critical, then this sets the {@link #jobsMustBeStopped} flag to
- * {@code true} which signals {@link #awaitTermination(boolean)} to stop all jobs.
- *
- * <p>Also logs details about {@param e} if it is classified as something that must be logged.
- */
- private void markToStopAllJobsIfNeeded(Throwable e) {
- boolean critical = false;
- ErrorClassification errorClassification = errorClassifier.classify(e);
- switch (errorClassification) {
- case CRITICAL_AND_LOG:
- critical = true;
- LOG.log(Level.WARNING, "Found critical error in queue visitor", e);
- break;
- case CRITICAL:
- critical = true;
- break;
- default:
- break;
- }
- errorHandler.handle(e, errorClassification);
- synchronized (zeroRemainingTasks) {
- if (critical && !jobsMustBeStopped) {
- jobsMustBeStopped = true;
- zeroRemainingTasks.notify();
- }
- }
- }
}
diff --git a/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java b/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java
index 5a304198a9..2e832c1fd8 100644
--- a/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java
+++ b/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java
@@ -18,14 +18,20 @@ import com.google.devtools.build.lib.util.Preconditions;
/** A classifier for {@link Error}s and {@link Exception}s. Used by {@link AbstractQueueVisitor}. */
public abstract class ErrorClassifier {
- /** Classification of an error thrown by an action. */
+ /**
+ * Classification of an error thrown by an action.
+ *
+ * <p>N.B. - These enum values are ordered from least severe to most severe.
+ */
public enum ErrorClassification {
+ /** Other running actions should be left alone.*/
+ NOT_CRITICAL,
/** All running actions should be stopped.*/
CRITICAL,
- /** Same as CRITICAL, but also log the error.*/
+ /** Same as {@link #CRITICAL}, but also log the error.*/
CRITICAL_AND_LOG,
- /** Other running actions should be left alone.*/
- NOT_CRITICAL
+ /** Same as {@link #CRITICAL_AND_LOG}, but is even worse. */
+ AS_CRITICAL_AS_POSSIBLE
}
/** Always treat exceptions as {@code NOT_CRITICAL}. */
@@ -39,19 +45,19 @@ public abstract class ErrorClassifier {
/**
* Used by {@link #classify} to classify {@link Exception}s. (Note that {@link Error}s
- * are always classified as {@code CRITICAL_AND_LOG}.)
+ * are always classified as {@code AS_CRITICAL_AS_POSSIBLE}.)
*
* @param e the exception object to check
*/
protected abstract ErrorClassification classifyException(Exception e);
/**
- * Classify {@param e}. If {@code e} is an {@link Error}, it will be classified as {@code
- * CRITICAL_AND_LOG}. Otherwise, calls {@link #classifyException}.
+ * Classify {@code e}. If {@code e} is an {@link Error}, it will be classified as
+ * {@code AS_CRITICAL_AS_POSSIBLE}. Otherwise, calls {@link #classifyException}.
*/
public final ErrorClassification classify(Throwable e) {
if (e instanceof Error) {
- return ErrorClassification.CRITICAL_AND_LOG;
+ return ErrorClassification.AS_CRITICAL_AS_POSSIBLE;
}
Preconditions.checkArgument(e instanceof Exception, e);
return classifyException((Exception) e);
diff --git a/src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java b/src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java
index 2bbf62d175..7353e5425c 100644
--- a/src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java
+++ b/src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java
@@ -735,6 +735,10 @@ public final class ParallelEvaluator implements Evaluator {
return ErrorClassification.CRITICAL;
}
if (e instanceof RuntimeException) {
+ // We treat non-SchedulerException RuntimeExceptions as more severe than
+ // SchedulerExceptions so that AbstractQueueVisitor will propagate instances of the
+ // former. They indicate actual Blaze bugs, rather than normal Skyframe evaluation
+ // control flow.
return ErrorClassification.CRITICAL_AND_LOG;
}
return ErrorClassification.NOT_CRITICAL;
diff --git a/src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java b/src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java
index 7a3982ed12..8f6d389b98 100644
--- a/src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java
+++ b/src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java
@@ -34,6 +34,7 @@ import org.junit.runners.JUnit4;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
+import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
@@ -408,7 +409,7 @@ public class AbstractQueueVisitorTest {
@Override
public void handle(Throwable t, ErrorClassification classification) {
if (t == error) {
- assertThat(classification).isEqualTo(ErrorClassification.CRITICAL_AND_LOG);
+ assertThat(classification).isEqualTo(ErrorClassification.AS_CRITICAL_AS_POSSIBLE);
criticalErrorSeen.compareAndSet(false, true);
} else {
fail();
@@ -459,6 +460,81 @@ public class AbstractQueueVisitorTest {
assertTrue(criticalErrorSeen.get());
}
+ private static class ClassifiedException extends RuntimeException {
+ private final ErrorClassification classification;
+
+ private ClassifiedException(ErrorClassification classification) {
+ this.classification = classification;
+ }
+ }
+
+ @Test
+ public void mostSevereErrorPropagated() throws Exception {
+ ThreadPoolExecutor executor = new ThreadPoolExecutor(2, 2, 0, TimeUnit.SECONDS,
+ new LinkedBlockingQueue<Runnable>());
+ final Set<Throwable> seenErrors = Sets.newConcurrentHashSet();
+ final ClassifiedException criticalException =
+ new ClassifiedException(ErrorClassification.CRITICAL);
+ final ClassifiedException criticalAndLogException =
+ new ClassifiedException(ErrorClassification.CRITICAL_AND_LOG);
+ final ErrorClassifier errorClassifier = new ErrorClassifier() {
+ @Override
+ protected ErrorClassification classifyException(Exception e) {
+ return (e instanceof ClassifiedException)
+ ? ((ClassifiedException) e).classification
+ : ErrorClassification.NOT_CRITICAL;
+ }
+ };
+ ErrorHandler errorHandler = new ErrorHandler() {
+ @Override
+ public void handle(Throwable t, ErrorClassification classification) {
+ assertThat(classification).isEqualTo(errorClassifier.classify(t));
+ seenErrors.add(t);
+ }
+ };
+ AbstractQueueVisitor visitor =
+ new AbstractQueueVisitor(
+ /*concurrent=*/ true,
+ executor,
+ /*shutdownOnCompletion=*/ true,
+ /*failFastOnException=*/ false,
+ errorClassifier,
+ errorHandler);
+ final CountDownLatch exnLatch = visitor.getExceptionLatchForTestingOnly();
+ Runnable criticalExceptionRunnable = new Runnable() {
+ @Override
+ public void run() {
+ throw criticalException;
+ }
+ };
+ Runnable criticalAndLogExceptionRunnable = new Runnable() {
+ @Override
+ public void run() {
+ // Wait for the critical exception to be thrown. There's a benign race between our 'await'
+ // call completing because the exception latch was counted down, and our thread being
+ // interrupted by AbstractQueueVisitor because the critical error was encountered. This is
+ // completely fine; all that matters is that we have a chance to throw our error _after_
+ // the previous one was thrown by the other Runnable.
+ try {
+ exnLatch.await();
+ } catch (InterruptedException e) {
+ // Ignored.
+ }
+ throw criticalAndLogException;
+ }
+ };
+ visitor.execute(criticalExceptionRunnable);
+ visitor.execute(criticalAndLogExceptionRunnable);
+ ClassifiedException exn = null;
+ try {
+ visitor.awaitQuiescence(/*interruptWorkers=*/ true);
+ } catch (ClassifiedException e) {
+ exn = e;
+ }
+ assertEquals(criticalAndLogException, exn);
+ assertThat(seenErrors).containsExactly(criticalException, criticalAndLogException);
+ }
+
private static Runnable throwingRunnable() {
return new Runnable() {
@Override
diff --git a/src/test/java/com/google/devtools/build/lib/concurrent/ErrorClassifierTest.java b/src/test/java/com/google/devtools/build/lib/concurrent/ErrorClassifierTest.java
new file mode 100644
index 0000000000..7c06270731
--- /dev/null
+++ b/src/test/java/com/google/devtools/build/lib/concurrent/ErrorClassifierTest.java
@@ -0,0 +1,38 @@
+// Copyright 2016 The Bazel Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package com.google.devtools.build.lib.concurrent;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import com.google.devtools.build.lib.concurrent.ErrorClassifier.ErrorClassification;
+import java.util.Arrays;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Tests for {@link ErrorClassifier}. */
+@RunWith(JUnit4.class)
+public class ErrorClassifierTest {
+ @Test
+ public void testErrorClassificationNaturalOrder() {
+ ErrorClassification[] values = ErrorClassification.values();
+ Arrays.sort(values);
+ assertThat(values).asList().containsExactly(
+ ErrorClassification.NOT_CRITICAL,
+ ErrorClassification.CRITICAL,
+ ErrorClassification.CRITICAL_AND_LOG,
+ ErrorClassification.AS_CRITICAL_AS_POSSIBLE).inOrder();
+ }
+}
+