aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java
diff options
context:
space:
mode:
authorGravatar jmmv <jmmv@google.com>2017-08-25 21:14:27 +0200
committerGravatar Vladimir Moskva <vladmos@google.com>2017-08-28 16:08:18 +0200
commit3ab810b1fcaa3aaf88d39a9ce1b47917870f845d (patch)
tree0b2fcf610353db9922e302163992a025beca0070 /src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java
parent658143c1b48c3ddb60f7f60544ff943486b201c9 (diff)
Ensure our "Another command is running" messages hold realistic PIDs.
Instead of blocking indefinitely for the server lock to become available when other commands are running, busy-wait for the lock to be released. This allows us to detect changes in the PID of the client that is holding the lock, and thus lets us make our wait messages more accurate. There have been multiple bug reports over time (especially from macOS users) where they complain that Bazel is stuck waiting for a non-existent PID. The code doesn't look obviously bogus, so this might just be a case of confusion based on the printed PID. By improving the diagnostic messages we output, we'll either make this confusion go away or have a chance of gathering more data when/if this happens again. This change has the side-effect of homogenizing the wait messages printed by both the Blaze client and the Blaze server and also adds details to know which component is printing what. PiperOrigin-RevId: 166508406
Diffstat (limited to 'src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java')
-rw-r--r--src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java37
1 files changed, 27 insertions, 10 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java
index bfa3f28489..c6313276cb 100644
--- a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java
+++ b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java
@@ -295,21 +295,29 @@ public class BlazeCommandDispatcher {
return ExitCode.COMMAND_LINE_ERROR.getNumericExitCode();
}
-
- long waitTimeInMs = 0;
+ // Take the exclusive server lock. If we fail, we busy-wait until the lock becomes available.
+ //
+ // We used to rely on commandLock.wait() to lazy-wait for the lock to become available, which is
+ // theoretically fine, but doing so prevents us from determining if the PID of the server
+ // holding the lock has changed under the hood. There have been multiple bug reports where
+ // users (especially macOS ones) mention that the Blaze invocation hangs on a non-existent PID.
+ // This should help troubleshoot those scenarios in case there really is a bug somewhere.
+ int attempts = 0;
+ long clockBefore = BlazeClock.nanoTime();
+ String otherClientDescription = "";
synchronized (commandLock) {
- boolean warningPrinted = false;
while (currentClientDescription != null) {
switch (lockingMode) {
case WAIT:
- if (!warningPrinted) {
- outErr.printErrLn("Another command (" + currentClientDescription + ") is running. "
- + " Waiting for it to complete...");
- warningPrinted = true;
+ if (otherClientDescription != currentClientDescription) {
+ if (attempts > 0) {
+ outErr.printErrLn(" lock taken by another command");
+ }
+ outErr.printErr("Another command (" + currentClientDescription + ") is running. "
+ + " Waiting for it to complete on the server...");
+ otherClientDescription = currentClientDescription;
}
- long clockBefore = BlazeClock.nanoTime();
- commandLock.wait();
- waitTimeInMs = (BlazeClock.nanoTime() - clockBefore) / (1000L * 1000L);
+ commandLock.wait(500);
break;
case ERROR_OUT:
@@ -320,10 +328,19 @@ public class BlazeCommandDispatcher {
default:
throw new IllegalStateException();
}
+
+ attempts += 1;
}
Verify.verify(currentClientDescription == null);
currentClientDescription = clientDescription;
}
+ if (attempts > 0) {
+ outErr.printErrLn(" done!");
+ }
+ // If we took the lock on the first try, force the reported wait time to 0 to avoid unnecessary
+ // noise in the logs. In this metric, we are only interested in knowing how long it took for
+ // other commands to complete, not how fast acquiring a lock is.
+ long waitTimeInMs = attempts == 0 ? 0 : (BlazeClock.nanoTime() - clockBefore) / (1000L * 1000L);
try {
if (shutdownReason != null) {