diff options
author | 2016-08-11 23:11:33 +0000 | |
---|---|---|
committer | 2016-08-12 08:54:09 +0000 | |
commit | 2c5d51712814d0e1f624364a013fc21795db3eda (patch) | |
tree | 6758ea76e926ebe7596fab52c7cbd397349ada62 | |
parent | d5353257a835631a83c25efff3b5560b5bbce7e7 (diff) |
Have AQV propagate the most severe error encountered by any of its tasks. Make sure that ParallelEvaluator treats SchedulerExceptions as less severe than other RuntimeExceptions (it already did, but add a comment emphasizing this).
The combination of the above means that we don't ignore hypothetical crashes in ParallelEvaluator worker threads _after_ a SchedulerException (e.g. due to an error in nokeep_going evaluation).
--
MOS_MIGRATED_REVID=130044230
5 files changed, 190 insertions, 63 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java b/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java index 9db1c22021..baab8847e7 100644 --- a/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java +++ b/src/main/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitor.java @@ -61,12 +61,21 @@ public class AbstractQueueVisitor implements QuiescingExecutor { }; /** - * The first unhandled exception thrown by a worker thread. We save it and re-throw it from - * the main thread to detect bugs faster; otherwise worker threads just quietly die. + * The most severe unhandled exception thrown by a worker thread, according to + * {@link #errorClassifier}. This exception gets propagated to the calling thread of + * {@link #awaitQuiescence} . We use the most severe error for the sake of not masking e.g. + * crashes in worker threads after the first critical error that can occur due to race conditions + * in client code. * - * Field updates happen only in blocks that are synchronized on the {@link - * AbstractQueueVisitor} object; it's important to save the first one as it may be more - * informative than a subsequent one, and this is not a performance-critical path. + * <p>Field updates happen only in blocks that are synchronized on the {@link + * AbstractQueueVisitor} object. + * + * <p>If {@link AbstractQueueVisitor} clients don't like the semantics of storing and propagating + * the most severe error, then they should be provide an {@link ErrorClassifier} that does the + * right thing (e.g. to cause the _first_ error to be propagated, you'd want to provide an + * {@link ErrorClassifier} that gives all errors the exact same {@link ErrorClassification}). + * + * <p>Note that this is not a performance-critical path. */ private volatile Throwable unhandled = null; @@ -413,6 +422,43 @@ public class AbstractQueueVisitor implements QuiescingExecutor { executorService.execute(runnable); } + private synchronized void maybeSaveUnhandledThrowable(Throwable e, boolean markToStopJobs) { + boolean critical = false; + ErrorClassification errorClassification = errorClassifier.classify(e); + switch (errorClassification) { + case AS_CRITICAL_AS_POSSIBLE: + case CRITICAL_AND_LOG: + critical = true; + LOG.log(Level.WARNING, "Found critical error in queue visitor", e); + break; + case CRITICAL: + critical = true; + break; + default: + break; + } + errorHandler.handle(e, errorClassification); + if (unhandled == null + || errorClassification.compareTo(errorClassifier.classify(unhandled)) > 0) { + // Save the most severe error. + unhandled = e; + exceptionLatch.countDown(); + } + if (markToStopJobs) { + synchronized (zeroRemainingTasks) { + if (critical && !jobsMustBeStopped) { + jobsMustBeStopped = true; + // This introduces a benign race, but it's the best we can do. When we have multiple + // errors of the same severity that is at least CRITICAL, we'll end up saving (above) and + // propagating (in 'awaitQuiescence') the most severe one we see, but the set of errors we + // see is non-deterministic and is at the mercy of how quickly the calling thread of + // 'awaitQuiescence' can do its thing after this 'notify' call. + zeroRemainingTasks.notify(); + } + } + } + } + private void recordError(Throwable e) { try { // If threadInterrupted is true, then RejectedExecutionExceptions are expected. There's no @@ -422,12 +468,7 @@ public class AbstractQueueVisitor implements QuiescingExecutor { return; } catastrophe = e; - synchronized (this) { - if (unhandled == null) { // save only the first one. - unhandled = e; - exceptionLatch.countDown(); - } - } + maybeSaveUnhandledThrowable(e, /*markToStopJobs=*/ false); } finally { decrementRemainingTasks(); } @@ -438,13 +479,12 @@ public class AbstractQueueVisitor implements QuiescingExecutor { * <ul> * <li>Sets {@link #run} to {@code true} when {@code WrappedRunnable} is run, * <li>Records the thread evaluating {@code r} in {@link #jobs} while {@code r} is evaluated, - * <li>Prevents {@param runnable} from being invoked if {@link #blockNewActions} returns + * <li>Prevents {@link #originalRunnable} from being invoked if {@link #blockNewActions} returns * {@code true}, * <li>Synchronously invokes {@code runnable.run()}, - * <li>Catches any {@link Throwable} thrown by {@code runnable.run()}, and if it is the first - * {@link Throwable} seen by this {@link AbstractQueueVisitor}, assigns it to {@link - * #unhandled}, and calls {@link #markToStopAllJobsIfNeeded} to set {@link #jobsMustBeStopped} - * if necessary, + * <li>Catches any {@link Throwable} thrown by {@code runnable.run()}, and if it is the most + * severe {@link Throwable} seen by this {@link AbstractQueueVisitor}, assigns it to + * {@link #unhandled}, and sets {@link #jobsMustBeStopped} if necessary, * <li>And, lastly, calls {@link #decrementRemainingTasks}. * </ul> */ @@ -473,13 +513,7 @@ public class AbstractQueueVisitor implements QuiescingExecutor { } originalRunnable.run(); } catch (Throwable e) { - synchronized (AbstractQueueVisitor.this) { - if (unhandled == null) { // save only the first one. - unhandled = e; - exceptionLatch.countDown(); - } - markToStopAllJobsIfNeeded(e); - } + maybeSaveUnhandledThrowable(e, /*markToStopJobs=*/ true); } finally { try { if (thread != null && addedJob) { @@ -635,35 +669,4 @@ public class AbstractQueueVisitor implements QuiescingExecutor { } } } - - /** - * Classifies a {@link Throwable} {@param e} thrown by a job. - * - * <p>If it is classified as critical, then this sets the {@link #jobsMustBeStopped} flag to - * {@code true} which signals {@link #awaitTermination(boolean)} to stop all jobs. - * - * <p>Also logs details about {@param e} if it is classified as something that must be logged. - */ - private void markToStopAllJobsIfNeeded(Throwable e) { - boolean critical = false; - ErrorClassification errorClassification = errorClassifier.classify(e); - switch (errorClassification) { - case CRITICAL_AND_LOG: - critical = true; - LOG.log(Level.WARNING, "Found critical error in queue visitor", e); - break; - case CRITICAL: - critical = true; - break; - default: - break; - } - errorHandler.handle(e, errorClassification); - synchronized (zeroRemainingTasks) { - if (critical && !jobsMustBeStopped) { - jobsMustBeStopped = true; - zeroRemainingTasks.notify(); - } - } - } } diff --git a/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java b/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java index 5a304198a9..2e832c1fd8 100644 --- a/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java +++ b/src/main/java/com/google/devtools/build/lib/concurrent/ErrorClassifier.java @@ -18,14 +18,20 @@ import com.google.devtools.build.lib.util.Preconditions; /** A classifier for {@link Error}s and {@link Exception}s. Used by {@link AbstractQueueVisitor}. */ public abstract class ErrorClassifier { - /** Classification of an error thrown by an action. */ + /** + * Classification of an error thrown by an action. + * + * <p>N.B. - These enum values are ordered from least severe to most severe. + */ public enum ErrorClassification { + /** Other running actions should be left alone.*/ + NOT_CRITICAL, /** All running actions should be stopped.*/ CRITICAL, - /** Same as CRITICAL, but also log the error.*/ + /** Same as {@link #CRITICAL}, but also log the error.*/ CRITICAL_AND_LOG, - /** Other running actions should be left alone.*/ - NOT_CRITICAL + /** Same as {@link #CRITICAL_AND_LOG}, but is even worse. */ + AS_CRITICAL_AS_POSSIBLE } /** Always treat exceptions as {@code NOT_CRITICAL}. */ @@ -39,19 +45,19 @@ public abstract class ErrorClassifier { /** * Used by {@link #classify} to classify {@link Exception}s. (Note that {@link Error}s - * are always classified as {@code CRITICAL_AND_LOG}.) + * are always classified as {@code AS_CRITICAL_AS_POSSIBLE}.) * * @param e the exception object to check */ protected abstract ErrorClassification classifyException(Exception e); /** - * Classify {@param e}. If {@code e} is an {@link Error}, it will be classified as {@code - * CRITICAL_AND_LOG}. Otherwise, calls {@link #classifyException}. + * Classify {@code e}. If {@code e} is an {@link Error}, it will be classified as + * {@code AS_CRITICAL_AS_POSSIBLE}. Otherwise, calls {@link #classifyException}. */ public final ErrorClassification classify(Throwable e) { if (e instanceof Error) { - return ErrorClassification.CRITICAL_AND_LOG; + return ErrorClassification.AS_CRITICAL_AS_POSSIBLE; } Preconditions.checkArgument(e instanceof Exception, e); return classifyException((Exception) e); diff --git a/src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java b/src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java index 2bbf62d175..7353e5425c 100644 --- a/src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java +++ b/src/main/java/com/google/devtools/build/skyframe/ParallelEvaluator.java @@ -735,6 +735,10 @@ public final class ParallelEvaluator implements Evaluator { return ErrorClassification.CRITICAL; } if (e instanceof RuntimeException) { + // We treat non-SchedulerException RuntimeExceptions as more severe than + // SchedulerExceptions so that AbstractQueueVisitor will propagate instances of the + // former. They indicate actual Blaze bugs, rather than normal Skyframe evaluation + // control flow. return ErrorClassification.CRITICAL_AND_LOG; } return ErrorClassification.NOT_CRITICAL; diff --git a/src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java b/src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java index 7a3982ed12..8f6d389b98 100644 --- a/src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java +++ b/src/test/java/com/google/devtools/build/lib/concurrent/AbstractQueueVisitorTest.java @@ -34,6 +34,7 @@ import org.junit.runners.JUnit4; import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; @@ -408,7 +409,7 @@ public class AbstractQueueVisitorTest { @Override public void handle(Throwable t, ErrorClassification classification) { if (t == error) { - assertThat(classification).isEqualTo(ErrorClassification.CRITICAL_AND_LOG); + assertThat(classification).isEqualTo(ErrorClassification.AS_CRITICAL_AS_POSSIBLE); criticalErrorSeen.compareAndSet(false, true); } else { fail(); @@ -459,6 +460,81 @@ public class AbstractQueueVisitorTest { assertTrue(criticalErrorSeen.get()); } + private static class ClassifiedException extends RuntimeException { + private final ErrorClassification classification; + + private ClassifiedException(ErrorClassification classification) { + this.classification = classification; + } + } + + @Test + public void mostSevereErrorPropagated() throws Exception { + ThreadPoolExecutor executor = new ThreadPoolExecutor(2, 2, 0, TimeUnit.SECONDS, + new LinkedBlockingQueue<Runnable>()); + final Set<Throwable> seenErrors = Sets.newConcurrentHashSet(); + final ClassifiedException criticalException = + new ClassifiedException(ErrorClassification.CRITICAL); + final ClassifiedException criticalAndLogException = + new ClassifiedException(ErrorClassification.CRITICAL_AND_LOG); + final ErrorClassifier errorClassifier = new ErrorClassifier() { + @Override + protected ErrorClassification classifyException(Exception e) { + return (e instanceof ClassifiedException) + ? ((ClassifiedException) e).classification + : ErrorClassification.NOT_CRITICAL; + } + }; + ErrorHandler errorHandler = new ErrorHandler() { + @Override + public void handle(Throwable t, ErrorClassification classification) { + assertThat(classification).isEqualTo(errorClassifier.classify(t)); + seenErrors.add(t); + } + }; + AbstractQueueVisitor visitor = + new AbstractQueueVisitor( + /*concurrent=*/ true, + executor, + /*shutdownOnCompletion=*/ true, + /*failFastOnException=*/ false, + errorClassifier, + errorHandler); + final CountDownLatch exnLatch = visitor.getExceptionLatchForTestingOnly(); + Runnable criticalExceptionRunnable = new Runnable() { + @Override + public void run() { + throw criticalException; + } + }; + Runnable criticalAndLogExceptionRunnable = new Runnable() { + @Override + public void run() { + // Wait for the critical exception to be thrown. There's a benign race between our 'await' + // call completing because the exception latch was counted down, and our thread being + // interrupted by AbstractQueueVisitor because the critical error was encountered. This is + // completely fine; all that matters is that we have a chance to throw our error _after_ + // the previous one was thrown by the other Runnable. + try { + exnLatch.await(); + } catch (InterruptedException e) { + // Ignored. + } + throw criticalAndLogException; + } + }; + visitor.execute(criticalExceptionRunnable); + visitor.execute(criticalAndLogExceptionRunnable); + ClassifiedException exn = null; + try { + visitor.awaitQuiescence(/*interruptWorkers=*/ true); + } catch (ClassifiedException e) { + exn = e; + } + assertEquals(criticalAndLogException, exn); + assertThat(seenErrors).containsExactly(criticalException, criticalAndLogException); + } + private static Runnable throwingRunnable() { return new Runnable() { @Override diff --git a/src/test/java/com/google/devtools/build/lib/concurrent/ErrorClassifierTest.java b/src/test/java/com/google/devtools/build/lib/concurrent/ErrorClassifierTest.java new file mode 100644 index 0000000000..7c06270731 --- /dev/null +++ b/src/test/java/com/google/devtools/build/lib/concurrent/ErrorClassifierTest.java @@ -0,0 +1,38 @@ +// Copyright 2016 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package com.google.devtools.build.lib.concurrent; + +import static com.google.common.truth.Truth.assertThat; + +import com.google.devtools.build.lib.concurrent.ErrorClassifier.ErrorClassification; +import java.util.Arrays; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link ErrorClassifier}. */ +@RunWith(JUnit4.class) +public class ErrorClassifierTest { + @Test + public void testErrorClassificationNaturalOrder() { + ErrorClassification[] values = ErrorClassification.values(); + Arrays.sort(values); + assertThat(values).asList().containsExactly( + ErrorClassification.NOT_CRITICAL, + ErrorClassification.CRITICAL, + ErrorClassification.CRITICAL_AND_LOG, + ErrorClassification.AS_CRITICAL_AS_POSSIBLE).inOrder(); + } +} + |