aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar George Gensure <ggensure@uber.com>2018-07-10 08:02:22 -0700
committerGravatar Copybara-Service <copybara-piper@google.com>2018-07-10 08:03:45 -0700
commitba76255ea973d22c3cf5456eda93d301ed4b4626 (patch)
treefd3c58bed217bd811256ca07261d5042950a5488
parentf0ced483576200de9c0ef05c5b5831a5952bf8b8 (diff)
Synchronize on process factory to inhibit ETXTBSY
Refocus synchronization mechanism to cope with file descriptor set fork- induced races to more tightly constrain concurrent fork/exec pairs. This problem has been observed in bazel proper repeatedly, exhibiting as the iconic ETXTBSY - Text file busy in wide worker pool builds and tests. Evidence that this was discovered by @buchgr is in the comment and change to the embedded ExecutionService implementation, and the description of the race and the need for the synchronization was lifted from that scope to the JavaSubprocessFactory. This factory is a singleton and represents the gateway to all worker process execution, and serves as the correct lock primitive to ensure that file descriptor sets are not duplicated across forks, which gave rise to this issue. To test this, I demonstrated a reproducer presented at https://bugs.java.com/view_bug.do?bug_id=8068370 with 2.4% of invocations in that pathological case exhibiting the issue. With a functionally equivalent change - synchronizing around a processBuilder.start() call - as the only modification to the reproducer, no further failures of any kind were observed, over several hundred runs. Closes #5556. PiperOrigin-RevId: 203947224
-rw-r--r--src/main/java/com/google/devtools/build/lib/shell/JavaSubprocessFactory.java24
-rw-r--r--src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java26
2 files changed, 27 insertions, 23 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/shell/JavaSubprocessFactory.java b/src/main/java/com/google/devtools/build/lib/shell/JavaSubprocessFactory.java
index bfc867d991..c282d57ab6 100644
--- a/src/main/java/com/google/devtools/build/lib/shell/JavaSubprocessFactory.java
+++ b/src/main/java/com/google/devtools/build/lib/shell/JavaSubprocessFactory.java
@@ -119,6 +119,28 @@ public class JavaSubprocessFactory implements SubprocessFactory {
// We are a singleton
}
+ // since we are a singleton, we represent an ideal global lock for
+ // process invocations, which is required due to the following race condition:
+
+ // Linux does not provide a safe API for a multi-threaded program to fork a subprocess.
+ // Consider the case where two threads both write an executable file and then try to execute
+ // it. It can happen that the first thread writes its executable file, with the file
+ // descriptor still being open when the second thread forks, with the fork inheriting a copy
+ // of the file descriptor. Then the first thread closes the original file descriptor, and
+ // proceeds to execute the file. At that point Linux sees an open file descriptor to the file
+ // and returns ETXTBSY (Text file busy) as an error. This race is inherent in the fork / exec
+ // duality, with fork always inheriting a copy of the file descriptor table; if there was a
+ // way to fork without copying the entire file descriptor table (e.g., only copy specific
+ // entries), we could avoid this race.
+ //
+ // I was able to reproduce this problem reliably by running significantly more threads than
+ // there are CPU cores on my workstation - the more threads the more likely it happens.
+ //
+ // As a workaround, we put a synchronized block around the fork.
+ private synchronized Process start(ProcessBuilder builder) throws IOException {
+ return builder.start();
+ }
+
@Override
public Subprocess create(SubprocessBuilder params) throws IOException {
ProcessBuilder builder = new ProcessBuilder();
@@ -137,7 +159,7 @@ public class JavaSubprocessFactory implements SubprocessFactory {
long deadlineMillis = params.getTimeoutMillis() > 0
? Math.addExact(System.currentTimeMillis(), params.getTimeoutMillis())
: 0;
- return new JavaSubprocess(builder.start(), deadlineMillis);
+ return new JavaSubprocess(start(builder), deadlineMillis);
}
/**
diff --git a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java
index f9506987f6..19556dc9bd 100644
--- a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java
+++ b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java
@@ -76,7 +76,6 @@ import java.util.logging.Logger;
final class ExecutionServer extends ExecutionImplBase {
private static final Logger logger = Logger.getLogger(ExecutionServer.class.getName());
- private final Object lock = new Object();
// The name of the container image entry in the Platform proto
// (see third_party/googleapis/devtools/remoteexecution/*/remote_execution.proto and
@@ -215,27 +214,10 @@ final class ExecutionServer extends ExecutionImplBase {
CommandResult cmdResult = null;
FutureCommandResult futureCmdResult = null;
- synchronized (lock) {
- // Linux does not provide a safe API for a multi-threaded program to fork a subprocess.
- // Consider the case where two threads both write an executable file and then try to execute
- // it. It can happen that the first thread writes its executable file, with the file
- // descriptor still being open when the second thread forks, with the fork inheriting a copy
- // of the file descriptor. Then the first thread closes the original file descriptor, and
- // proceeds to execute the file. At that point Linux sees an open file descriptor to the file
- // and returns ETXTBSY (Text file busy) as an error. This race is inherent in the fork / exec
- // duality, with fork always inheriting a copy of the file descriptor table; if there was a
- // way to fork without copying the entire file descriptor table (e.g., only copy specific
- // entries), we could avoid this race.
- //
- // I was able to reproduce this problem reliably by running significantly more threads than
- // there are CPU cores on my workstation - the more threads the more likely it happens.
- //
- // As a workaround, we put a synchronized block around the fork.
- try {
- futureCmdResult = cmd.executeAsync();
- } catch (CommandException e) {
- Throwables.throwIfInstanceOf(e.getCause(), IOException.class);
- }
+ try {
+ futureCmdResult = cmd.executeAsync();
+ } catch (CommandException e) {
+ Throwables.throwIfInstanceOf(e.getCause(), IOException.class);
}
if (futureCmdResult != null) {