diff options
author | 2016-08-11 23:11:33 +0000 | |
---|---|---|
committer | 2016-08-12 08:54:09 +0000 | |
commit | 2c5d51712814d0e1f624364a013fc21795db3eda (patch) | |
tree | 6758ea76e926ebe7596fab52c7cbd397349ada62 /src/main/java/com/google/devtools/build/lib/concurrent | |
parent | d5353257a835631a83c25efff3b5560b5bbce7e7 (diff) |
Have AQV propagate the most severe error encountered by any of its tasks. Make sure that ParallelEvaluator treats SchedulerExceptions as less severe than other RuntimeExceptions (it already did, but add a comment emphasizing this).
The combination of the above means that we don't ignore hypothetical crashes in ParallelEvaluator worker threads _after_ a SchedulerException (e.g. due to an error in nokeep_going evaluation).
--
MOS_MIGRATED_REVID=130044230
Diffstat (limited to 'src/main/java/com/google/devtools/build/lib/concurrent')
-rw-r--r-- | src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java | 111 | ||||
-rw-r--r-- | src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java | 22 |
2 files changed, 71 insertions, 62 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java b/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java index 9db1c22021..baab8847e7 100644 --- a/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java +++ b/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java @@ -61,12 +61,21 @@ public class AbstractQueueVisitor implements QuiescingExecutor { }; /** - * The first unhandled exception thrown by a worker thread. We save it and re-throw it from - * the main thread to detect bugs faster; otherwise worker threads just quietly die. + * The most severe unhandled exception thrown by a worker thread, according to + * {@link #errorClassifier}. This exception gets propagated to the calling thread of + * {@link #awaitQuiescence} . We use the most severe error for the sake of not masking e.g. + * crashes in worker threads after the first critical error that can occur due to race conditions + * in client code. * - * Field updates happen only in blocks that are synchronized on the {@link - * AbstractQueueVisitor} object; it's important to save the first one as it may be more - * informative than a subsequent one, and this is not a performance-critical path. + * <p>Field updates happen only in blocks that are synchronized on the {@link + * AbstractQueueVisitor} object. + * + * <p>If {@link AbstractQueueVisitor} clients don't like the semantics of storing and propagating + * the most severe error, then they should be provide an {@link ErrorClassifier} that does the + * right thing (e.g. to cause the _first_ error to be propagated, you'd want to provide an + * {@link ErrorClassifier} that gives all errors the exact same {@link ErrorClassification}). + * + * <p>Note that this is not a performance-critical path. */ private volatile Throwable unhandled = null; @@ -413,6 +422,43 @@ public class AbstractQueueVisitor implements QuiescingExecutor { executorService.execute(runnable); } + private synchronized void maybeSaveUnhandledThrowable(Throwable e, boolean markToStopJobs) { + boolean critical = false; + ErrorClassification errorClassification = errorClassifier.classify(e); + switch (errorClassification) { + case AS_CRITICAL_AS_POSSIBLE: + case CRITICAL_AND_LOG: + critical = true; + LOG.log(Level.WARNING, "Found critical error in queue visitor", e); + break; + case CRITICAL: + critical = true; + break; + default: + break; + } + errorHandler.handle(e, errorClassification); + if (unhandled == null + || errorClassification.compareTo(errorClassifier.classify(unhandled)) > 0) { + // Save the most severe error. + unhandled = e; + exceptionLatch.countDown(); + } + if (markToStopJobs) { + synchronized (zeroRemainingTasks) { + if (critical && !jobsMustBeStopped) { + jobsMustBeStopped = true; + // This introduces a benign race, but it's the best we can do. When we have multiple + // errors of the same severity that is at least CRITICAL, we'll end up saving (above) and + // propagating (in 'awaitQuiescence') the most severe one we see, but the set of errors we + // see is non-deterministic and is at the mercy of how quickly the calling thread of + // 'awaitQuiescence' can do its thing after this 'notify' call. + zeroRemainingTasks.notify(); + } + } + } + } + private void recordError(Throwable e) { try { // If threadInterrupted is true, then RejectedExecutionExceptions are expected. There's no @@ -422,12 +468,7 @@ public class AbstractQueueVisitor implements QuiescingExecutor { return; } catastrophe = e; - synchronized (this) { - if (unhandled == null) { // save only the first one. - unhandled = e; - exceptionLatch.countDown(); - } - } + maybeSaveUnhandledThrowable(e, /*markToStopJobs=*/ false); } finally { decrementRemainingTasks(); } @@ -438,13 +479,12 @@ public class AbstractQueueVisitor implements QuiescingExecutor { * <ul> * <li>Sets {@link #run} to {@code true} when {@code WrappedRunnable} is run, * <li>Records the thread evaluating {@code r} in {@link #jobs} while {@code r} is evaluated, - * <li>Prevents {@param runnable} from being invoked if {@link #blockNewActions} returns + * <li>Prevents {@link #originalRunnable} from being invoked if {@link #blockNewActions} returns * {@code true}, * <li>Synchronously invokes {@code runnable.run()}, - * <li>Catches any {@link Throwable} thrown by {@code runnable.run()}, and if it is the first - * {@link Throwable} seen by this {@link AbstractQueueVisitor}, assigns it to {@link - * #unhandled}, and calls {@link #markToStopAllJobsIfNeeded} to set {@link #jobsMustBeStopped} - * if necessary, + * <li>Catches any {@link Throwable} thrown by {@code runnable.run()}, and if it is the most + * severe {@link Throwable} seen by this {@link AbstractQueueVisitor}, assigns it to + * {@link #unhandled}, and sets {@link #jobsMustBeStopped} if necessary, * <li>And, lastly, calls {@link #decrementRemainingTasks}. * </ul> */ @@ -473,13 +513,7 @@ public class AbstractQueueVisitor implements QuiescingExecutor { } originalRunnable.run(); } catch (Throwable e) { - synchronized (AbstractQueueVisitor.this) { - if (unhandled == null) { // save only the first one. - unhandled = e; - exceptionLatch.countDown(); - } - markToStopAllJobsIfNeeded(e); - } + maybeSaveUnhandledThrowable(e, /*markToStopJobs=*/ true); } finally { try { if (thread != null && addedJob) { @@ -635,35 +669,4 @@ public class AbstractQueueVisitor implements QuiescingExecutor { } } } - - /** - * Classifies a {@link Throwable} {@param e} thrown by a job. - * - * <p>If it is classified as critical, then this sets the {@link #jobsMustBeStopped} flag to - * {@code true} which signals {@link #awaitTermination(boolean)} to stop all jobs. - * - * <p>Also logs details about {@param e} if it is classified as something that must be logged. - */ - private void markToStopAllJobsIfNeeded(Throwable e) { - boolean critical = false; - ErrorClassification errorClassification = errorClassifier.classify(e); - switch (errorClassification) { - case CRITICAL_AND_LOG: - critical = true; - LOG.log(Level.WARNING, "Found critical error in queue visitor", e); - break; - case CRITICAL: - critical = true; - break; - default: - break; - } - errorHandler.handle(e, errorClassification); - synchronized (zeroRemainingTasks) { - if (critical && !jobsMustBeStopped) { - jobsMustBeStopped = true; - zeroRemainingTasks.notify(); - } - } - } } diff --git a/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java b/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java index 5a304198a9..2e832c1fd8 100644 --- a/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java +++ b/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java @@ -18,14 +18,20 @@ import com.google.devtools.build.lib.util.Preconditions; /** A classifier for {@link Error}s and {@link Exception}s. Used by {@link AbstractQueueVisitor}. */ public abstract class ErrorClassifier { - /** Classification of an error thrown by an action. */ + /** + * Classification of an error thrown by an action. + * + * <p>N.B. - These enum values are ordered from least severe to most severe. + */ public enum ErrorClassification { + /** Other running actions should be left alone.*/ + NOT_CRITICAL, /** All running actions should be stopped.*/ CRITICAL, - /** Same as CRITICAL, but also log the error.*/ + /** Same as {@link #CRITICAL}, but also log the error.*/ CRITICAL_AND_LOG, - /** Other running actions should be left alone.*/ - NOT_CRITICAL + /** Same as {@link #CRITICAL_AND_LOG}, but is even worse. */ + AS_CRITICAL_AS_POSSIBLE } /** Always treat exceptions as {@code NOT_CRITICAL}. */ @@ -39,19 +45,19 @@ public abstract class ErrorClassifier { /** * Used by {@link #classify} to classify {@link Exception}s. (Note that {@link Error}s - * are always classified as {@code CRITICAL_AND_LOG}.) + * are always classified as {@code AS_CRITICAL_AS_POSSIBLE}.) * * @param e the exception object to check */ protected abstract ErrorClassification classifyException(Exception e); /** - * Classify {@param e}. If {@code e} is an {@link Error}, it will be classified as {@code - * CRITICAL_AND_LOG}. Otherwise, calls {@link #classifyException}. + * Classify {@code e}. If {@code e} is an {@link Error}, it will be classified as + * {@code AS_CRITICAL_AS_POSSIBLE}. Otherwise, calls {@link #classifyException}. */ public final ErrorClassification classify(Throwable e) { if (e instanceof Error) { - return ErrorClassification.CRITICAL_AND_LOG; + return ErrorClassification.AS_CRITICAL_AS_POSSIBLE; } Preconditions.checkArgument(e instanceof Exception, e); return classifyException((Exception) e); |