Ensure our "Another command is running" messages hold realistic PIDs.

Instead of blocking indefinitely for the server lock to become available when other commands are running, busy-wait for the lock to be released. This allows us to detect changes in the PID of the client that is holding the lock, and thus lets us make our wait messages more accurate. There have been multiple bug reports over time (especially from macOS users) where they complain that Bazel is stuck waiting for a non-existent PID. The code doesn't look obviously bogus, so this might just be a case of confusion based on the printed PID. By improving the diagnostic messages we output, we'll either make this confusion go away or have a chance of gathering more data when/if this happens again. This change has the side-effect of homogenizing the wait messages printed by both the Blaze client and the Blaze server and also adds details to know which component is printing what. PiperOrigin-RevId: 166508406
author: jmmv <jmmv@google.com> 2017-08-25 21:14:27 +0200
committer: Vladimir Moskva <vladmos@google.com> 2017-08-28 16:08:18 +0200
commit: 3ab810b1fcaa3aaf88d39a9ce1b47917870f845d (patch)
tree: 0b2fcf610353db9922e302163992a025beca0070 /src/main
parent: 658143c1b48c3ddb60f7f60544ff943486b201c9 (diff)
2 files changed, 62 insertions, 32 deletions
diff --git a/src/main/cpp/blaze_util_posix.cc b/src/main/cpp/blaze_util_posix.cc
index 55567b9f93..a53b9316d4 100644
--- a/src/main/cpp/blaze_util_posix.cc
+++ b/src/main/cpp/blaze_util_posix.cc
@@ -579,15 +579,25 @@ uint64_t AcquireLock(const string& output_base, bool batch_mode, bool block,
   // later if that becomes meaningful.  (Ranges beyond EOF can be locked.)
   lock.l_len = 4096;
 
-  uint64_t wait_time = 0;
-  // Try to take the lock, without blocking.
-  if (fcntl(lockfd, F_SETLK, &lock) == -1) {
+  // Take the exclusive server lock.  If we fail, we busy-wait until the lock
+  // becomes available.
+  //
+  // We used to rely on fcntl(F_SETLKW) to lazy-wait for the lock to become
+  // available, which is theoretically fine, but doing so prevents us from
+  // determining if the PID of the server holding the lock has changed under the
+  // hood.  There have been multiple bug reports where users (especially macOS
+  // ones) mention that the Blaze invocation hangs on a non-existent PID.  This
+  // should help troubleshoot those scenarios in case there really is a bug
+  // somewhere.
+  size_t attempts = 0;
+  pid_t other_pid = 0;
+  const uint64_t start_time = GetMillisecondsMonotonic();
+  while (fcntl(lockfd, F_SETLK, &lock) == -1) {
     if (errno != EACCES && errno != EAGAIN) {
       pdie(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR,
            "unexpected result from F_SETLK");
     }
 
-    // We didn't get the lock.  Find out who has it.
     struct flock probe = lock;
     probe.l_pid = 0;
     if (fcntl(lockfd, F_GETLK, &probe) == -1) {
@@ -599,26 +609,29 @@ uint64_t AcquireLock(const string& output_base, bool batch_mode, bool block,
           "Another command is running (pid=%d). Exiting immediately.",
           probe.l_pid);
     }
-    fprintf(stderr, "Another command is running (pid = %d).  "
-            "Waiting for it to complete...", probe.l_pid);
-    fflush(stderr);
-
-    // Take a clock sample for that start of the waiting time
-    uint64_t st = GetMillisecondsMonotonic();
-    // Try to take the lock again (blocking).
-    int r;
-    do {
-      r = fcntl(lockfd, F_SETLKW, &lock);
-    } while (r == -1 && errno == EINTR);
-    fprintf(stderr, "\n");
-    if (r == -1) {
-      pdie(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR,
-           "couldn't acquire file lock");
+    if (other_pid != probe.l_pid) {
+      if (attempts > 0) {
+        fprintf(stderr, " lock taken by another command\n");
+      }
+      fprintf(stderr, "Another command (pid=%d) is running.  "
+              "Waiting for it to complete on the client...", probe.l_pid);
+      fflush(stderr);
+      other_pid = probe.l_pid;
     }
-    // Take another clock sample, calculate elapsed
-    uint64_t et = GetMillisecondsMonotonic();
-    wait_time = et - st;
+
+    TrySleep(500);
+    attempts += 1;
   }
+  if (attempts > 0) {
+    fprintf(stderr, " done!\n");
+  }
+  const uint64_t end_time = GetMillisecondsMonotonic();
+
+  // If we took the lock on the first try, force the reported wait time to 0 to
+  // avoid unnecessary noise in the logs.  In this metric, we are only
+  // interested in knowing how long it took for other commands to complete, not
+  // how fast acquiring a lock is.
+  const uint64_t wait_time = attempts == 0 ? 0 : end_time - start_time;
 
   // Identify ourselves in the lockfile.
   (void) ftruncate(lockfd, 0);
diff --git a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java
index bfa3f28489..c6313276cb 100644
--- a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java
+++ b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java
@@ -295,21 +295,29 @@ public class BlazeCommandDispatcher {
       return ExitCode.COMMAND_LINE_ERROR.getNumericExitCode();
     }
 
-
-    long waitTimeInMs = 0;
+    // Take the exclusive server lock.  If we fail, we busy-wait until the lock becomes available.
+    //
+    // We used to rely on commandLock.wait() to lazy-wait for the lock to become available, which is
+    // theoretically fine, but doing so prevents us from determining if the PID of the server
+    // holding the lock has changed under the hood.  There have been multiple bug reports where
+    // users (especially macOS ones) mention that the Blaze invocation hangs on a non-existent PID.
+    // This should help troubleshoot those scenarios in case there really is a bug somewhere.
+    int attempts = 0;
+    long clockBefore = BlazeClock.nanoTime();
+    String otherClientDescription = "";
     synchronized (commandLock) {
-      boolean warningPrinted = false;
       while (currentClientDescription != null) {
         switch (lockingMode) {
           case WAIT:
-            if (!warningPrinted) {
-              outErr.printErrLn("Another command (" + currentClientDescription + ") is running. "
-                  + " Waiting for it to complete...");
-              warningPrinted = true;
+            if (otherClientDescription != currentClientDescription) {
+              if (attempts > 0) {
+                outErr.printErrLn(" lock taken by another command");
+              }
+              outErr.printErr("Another command (" + currentClientDescription + ") is running. "
+                  + " Waiting for it to complete on the server...");
+              otherClientDescription = currentClientDescription;
             }
-            long clockBefore = BlazeClock.nanoTime();
-            commandLock.wait();
-            waitTimeInMs = (BlazeClock.nanoTime() - clockBefore) / (1000L * 1000L);
+            commandLock.wait(500);
             break;
 
           case ERROR_OUT:
@@ -320,10 +328,19 @@ public class BlazeCommandDispatcher {
           default:
             throw new IllegalStateException();
         }
+
+        attempts += 1;
       }
       Verify.verify(currentClientDescription == null);
       currentClientDescription = clientDescription;
     }
+    if (attempts > 0) {
+      outErr.printErrLn(" done!");
+    }
+    // If we took the lock on the first try, force the reported wait time to 0 to avoid unnecessary
+    // noise in the logs.  In this metric, we are only interested in knowing how long it took for
+    // other commands to complete, not how fast acquiring a lock is.
+    long waitTimeInMs = attempts == 0 ? 0 : (BlazeClock.nanoTime() - clockBefore) / (1000L * 1000L);
 
     try {
       if (shutdownReason != null) {
author	jmmv <jmmv@google.com>	2017-08-25 21:14:27 +0200
committer	Vladimir Moskva <vladmos@google.com>	2017-08-28 16:08:18 +0200
commit	3ab810b1fcaa3aaf88d39a9ce1b47917870f845d (patch)
tree	0b2fcf610353db9922e302163992a025beca0070 /src/main
parent	658143c1b48c3ddb60f7f60544ff943486b201c9 (diff)