aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/main/java/com/google/devtools/build/lib/concurrent
diff options
context:
space:
mode:
authorGravatar Nathan Harmata <nharmata@google.com>2016-08-11 23:11:33 +0000
committerGravatar Yue Gan <yueg@google.com>2016-08-12 08:54:09 +0000
commit2c5d51712814d0e1f624364a013fc21795db3eda (patch)
tree6758ea76e926ebe7596fab52c7cbd397349ada62 /src/main/java/com/google/devtools/build/lib/concurrent
parentd5353257a835631a83c25efff3b5560b5bbce7e7 (diff)
Have AQV propagate the most severe error encountered by any of its tasks. Make sure that ParallelEvaluator treats SchedulerExceptions as less severe than other RuntimeExceptions (it already did, but add a comment emphasizing this).
The combination of the above means that we don't ignore hypothetical crashes in ParallelEvaluator worker threads _after_ a SchedulerException (e.g. due to an error in nokeep_going evaluation). -- MOS_MIGRATED_REVID=130044230
Diffstat (limited to 'src/main/java/com/google/devtools/build/lib/concurrent')
-rw-r--r--src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java111
-rw-r--r--src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java22
2 files changed, 71 insertions, 62 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java b/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java
index 9db1c22021..baab8847e7 100644
--- a/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java
+++ b/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java
@@ -61,12 +61,21 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
};
/**
- * The first unhandled exception thrown by a worker thread. We save it and re-throw it from
- * the main thread to detect bugs faster; otherwise worker threads just quietly die.
+ * The most severe unhandled exception thrown by a worker thread, according to
+ * {@link #errorClassifier}. This exception gets propagated to the calling thread of
+ * {@link #awaitQuiescence} . We use the most severe error for the sake of not masking e.g.
+ * crashes in worker threads after the first critical error that can occur due to race conditions
+ * in client code.
*
- * Field updates happen only in blocks that are synchronized on the {@link
- * AbstractQueueVisitor} object; it's important to save the first one as it may be more
- * informative than a subsequent one, and this is not a performance-critical path.
+ * <p>Field updates happen only in blocks that are synchronized on the {@link
+ * AbstractQueueVisitor} object.
+ *
+ * <p>If {@link AbstractQueueVisitor} clients don't like the semantics of storing and propagating
+ * the most severe error, then they should be provide an {@link ErrorClassifier} that does the
+ * right thing (e.g. to cause the _first_ error to be propagated, you'd want to provide an
+ * {@link ErrorClassifier} that gives all errors the exact same {@link ErrorClassification}).
+ *
+ * <p>Note that this is not a performance-critical path.
*/
private volatile Throwable unhandled = null;
@@ -413,6 +422,43 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
executorService.execute(runnable);
}
+ private synchronized void maybeSaveUnhandledThrowable(Throwable e, boolean markToStopJobs) {
+ boolean critical = false;
+ ErrorClassification errorClassification = errorClassifier.classify(e);
+ switch (errorClassification) {
+ case AS_CRITICAL_AS_POSSIBLE:
+ case CRITICAL_AND_LOG:
+ critical = true;
+ LOG.log(Level.WARNING, "Found critical error in queue visitor", e);
+ break;
+ case CRITICAL:
+ critical = true;
+ break;
+ default:
+ break;
+ }
+ errorHandler.handle(e, errorClassification);
+ if (unhandled == null
+ || errorClassification.compareTo(errorClassifier.classify(unhandled)) > 0) {
+ // Save the most severe error.
+ unhandled = e;
+ exceptionLatch.countDown();
+ }
+ if (markToStopJobs) {
+ synchronized (zeroRemainingTasks) {
+ if (critical && !jobsMustBeStopped) {
+ jobsMustBeStopped = true;
+ // This introduces a benign race, but it's the best we can do. When we have multiple
+ // errors of the same severity that is at least CRITICAL, we'll end up saving (above) and
+ // propagating (in 'awaitQuiescence') the most severe one we see, but the set of errors we
+ // see is non-deterministic and is at the mercy of how quickly the calling thread of
+ // 'awaitQuiescence' can do its thing after this 'notify' call.
+ zeroRemainingTasks.notify();
+ }
+ }
+ }
+ }
+
private void recordError(Throwable e) {
try {
// If threadInterrupted is true, then RejectedExecutionExceptions are expected. There's no
@@ -422,12 +468,7 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
return;
}
catastrophe = e;
- synchronized (this) {
- if (unhandled == null) { // save only the first one.
- unhandled = e;
- exceptionLatch.countDown();
- }
- }
+ maybeSaveUnhandledThrowable(e, /*markToStopJobs=*/ false);
} finally {
decrementRemainingTasks();
}
@@ -438,13 +479,12 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
* <ul>
* <li>Sets {@link #run} to {@code true} when {@code WrappedRunnable} is run,
* <li>Records the thread evaluating {@code r} in {@link #jobs} while {@code r} is evaluated,
- * <li>Prevents {@param runnable} from being invoked if {@link #blockNewActions} returns
+ * <li>Prevents {@link #originalRunnable} from being invoked if {@link #blockNewActions} returns
* {@code true},
* <li>Synchronously invokes {@code runnable.run()},
- * <li>Catches any {@link Throwable} thrown by {@code runnable.run()}, and if it is the first
- * {@link Throwable} seen by this {@link AbstractQueueVisitor}, assigns it to {@link
- * #unhandled}, and calls {@link #markToStopAllJobsIfNeeded} to set {@link #jobsMustBeStopped}
- * if necessary,
+ * <li>Catches any {@link Throwable} thrown by {@code runnable.run()}, and if it is the most
+ * severe {@link Throwable} seen by this {@link AbstractQueueVisitor}, assigns it to
+ * {@link #unhandled}, and sets {@link #jobsMustBeStopped} if necessary,
* <li>And, lastly, calls {@link #decrementRemainingTasks}.
* </ul>
*/
@@ -473,13 +513,7 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
}
originalRunnable.run();
} catch (Throwable e) {
- synchronized (AbstractQueueVisitor.this) {
- if (unhandled == null) { // save only the first one.
- unhandled = e;
- exceptionLatch.countDown();
- }
- markToStopAllJobsIfNeeded(e);
- }
+ maybeSaveUnhandledThrowable(e, /*markToStopJobs=*/ true);
} finally {
try {
if (thread != null && addedJob) {
@@ -635,35 +669,4 @@ public class AbstractQueueVisitor implements QuiescingExecutor {
}
}
}
-
- /**
- * Classifies a {@link Throwable} {@param e} thrown by a job.
- *
- * <p>If it is classified as critical, then this sets the {@link #jobsMustBeStopped} flag to
- * {@code true} which signals {@link #awaitTermination(boolean)} to stop all jobs.
- *
- * <p>Also logs details about {@param e} if it is classified as something that must be logged.
- */
- private void markToStopAllJobsIfNeeded(Throwable e) {
- boolean critical = false;
- ErrorClassification errorClassification = errorClassifier.classify(e);
- switch (errorClassification) {
- case CRITICAL_AND_LOG:
- critical = true;
- LOG.log(Level.WARNING, "Found critical error in queue visitor", e);
- break;
- case CRITICAL:
- critical = true;
- break;
- default:
- break;
- }
- errorHandler.handle(e, errorClassification);
- synchronized (zeroRemainingTasks) {
- if (critical && !jobsMustBeStopped) {
- jobsMustBeStopped = true;
- zeroRemainingTasks.notify();
- }
- }
- }
}
diff --git a/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java b/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java
index 5a304198a9..2e832c1fd8 100644
--- a/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java
+++ b/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java
@@ -18,14 +18,20 @@ import com.google.devtools.build.lib.util.Preconditions;
/** A classifier for {@link Error}s and {@link Exception}s. Used by {@link AbstractQueueVisitor}. */
public abstract class ErrorClassifier {
- /** Classification of an error thrown by an action. */
+ /**
+ * Classification of an error thrown by an action.
+ *
+ * <p>N.B. - These enum values are ordered from least severe to most severe.
+ */
public enum ErrorClassification {
+ /** Other running actions should be left alone.*/
+ NOT_CRITICAL,
/** All running actions should be stopped.*/
CRITICAL,
- /** Same as CRITICAL, but also log the error.*/
+ /** Same as {@link #CRITICAL}, but also log the error.*/
CRITICAL_AND_LOG,
- /** Other running actions should be left alone.*/
- NOT_CRITICAL
+ /** Same as {@link #CRITICAL_AND_LOG}, but is even worse. */
+ AS_CRITICAL_AS_POSSIBLE
}
/** Always treat exceptions as {@code NOT_CRITICAL}. */
@@ -39,19 +45,19 @@ public abstract class ErrorClassifier {
/**
* Used by {@link #classify} to classify {@link Exception}s. (Note that {@link Error}s
- * are always classified as {@code CRITICAL_AND_LOG}.)
+ * are always classified as {@code AS_CRITICAL_AS_POSSIBLE}.)
*
* @param e the exception object to check
*/
protected abstract ErrorClassification classifyException(Exception e);
/**
- * Classify {@param e}. If {@code e} is an {@link Error}, it will be classified as {@code
- * CRITICAL_AND_LOG}. Otherwise, calls {@link #classifyException}.
+ * Classify {@code e}. If {@code e} is an {@link Error}, it will be classified as
+ * {@code AS_CRITICAL_AS_POSSIBLE}. Otherwise, calls {@link #classifyException}.
*/
public final ErrorClassification classify(Throwable e) {
if (e instanceof Error) {
- return ErrorClassification.CRITICAL_AND_LOG;
+ return ErrorClassification.AS_CRITICAL_AS_POSSIBLE;
}
Preconditions.checkArgument(e instanceof Exception, e);
return classifyException((Exception) e);