aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar buchgr <buchgr@google.com>2017-08-07 14:42:06 +0200
committerGravatar Jakob Buchgraber <buchgr@google.com>2017-08-07 14:49:59 +0200
commit91fb38e92ace6cf14ce5da6527d71320b4e3f3d2 (patch)
tree3a98f1c9470063a3e17b95ad4c127f205b5d9c01
parent904563c35b23e3b3f3c171ab0a6189dccd384b39 (diff)
remote_worker: Serialize fork() calls. Fixes #3356
Spawning processes concurrently from multiple threads doesn't work reliably on Linux (see bug for details). This change introduces simply puts a synchronized block around fork(). I have tested this change by repeatedly running a build of bazel for 2 hours on a 64 core machine with up to 200 concurrent actions. There wasn't a single failure. PiperOrigin-RevId: 164450559
-rw-r--r--src/tools/remote_worker/src/main/java/com/google/devtools/build/remote/ExecutionServer.java64
1 files changed, 34 insertions, 30 deletions
diff --git a/src/tools/remote_worker/src/main/java/com/google/devtools/build/remote/ExecutionServer.java b/src/tools/remote_worker/src/main/java/com/google/devtools/build/remote/ExecutionServer.java
index 3e58092cdf..417e742c32 100644
--- a/src/tools/remote_worker/src/main/java/com/google/devtools/build/remote/ExecutionServer.java
+++ b/src/tools/remote_worker/src/main/java/com/google/devtools/build/remote/ExecutionServer.java
@@ -32,6 +32,7 @@ import com.google.devtools.build.lib.shell.AbnormalTerminationException;
import com.google.devtools.build.lib.shell.Command;
import com.google.devtools.build.lib.shell.CommandException;
import com.google.devtools.build.lib.shell.CommandResult;
+import com.google.devtools.build.lib.shell.FutureCommandResult;
import com.google.devtools.build.lib.shell.TimeoutKillableObserver;
import com.google.devtools.build.lib.vfs.FileSystemUtils;
import com.google.devtools.build.lib.vfs.Path;
@@ -49,6 +50,7 @@ import com.google.rpc.Status;
import io.grpc.StatusException;
import io.grpc.protobuf.StatusProto;
import io.grpc.stub.StreamObserver;
+import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
@@ -71,6 +73,8 @@ import java.util.logging.Logger;
final class ExecutionServer extends ExecutionImplBase {
private static final Logger logger = Logger.getLogger(ExecutionServer.class.getName());
+ private final Object lock = new Object();
+
// The name of the container image entry in the Platform proto
// (see third_party/googleapis/devtools/remoteexecution/*/remote_execution.proto and
// experimental_remote_platform_override in
@@ -200,41 +204,41 @@ final class ExecutionServer extends ExecutionImplBase {
execRoot.getPathString());
long startTime = System.currentTimeMillis();
CommandResult cmdResult = null;
- // Linux does not provide a safe API for a multi-threaded program to fork a subprocess. Consider
- // the case where two threads both write an executable file and then try to execute it. It can
- // happen that the first thread writes its executable file, with the file descriptor still
- // being open when the second thread forks, with the fork inheriting a copy of the file
- // descriptor. Then the first thread closes the original file descriptor, and proceeds to
- // execute the file. At that point Linux sees an open file descriptor to the file and returns
- // ETXTBSY (Text file busy) as an error. This race is inherent in the fork / exec duality, with
- // fork always inheriting a copy of the file descriptor table; if there was a way to fork
- // without copying the entire file descriptor table (e.g., only copy specific entries), we could
- // avoid this race.
- //
- // I was able to reproduce this problem reliably by running significantly more threads than
- // there are CPU cores on my workstation - the more threads the more likely it happens.
- //
- // As a workaround, we retry up to two times before we let the exception propagate.
- int attempt = 0;
- while (true) {
+
+ FutureCommandResult futureCmdResult = null;
+ synchronized (lock) {
+ // Linux does not provide a safe API for a multi-threaded program to fork a subprocess.
+ // Consider the case where two threads both write an executable file and then try to execute
+ // it. It can happen that the first thread writes its executable file, with the file
+ // descriptor still being open when the second thread forks, with the fork inheriting a copy
+ // of the file descriptor. Then the first thread closes the original file descriptor, and
+ // proceeds to execute the file. At that point Linux sees an open file descriptor to the file
+ // and returns ETXTBSY (Text file busy) as an error. This race is inherent in the fork / exec
+ // duality, with fork always inheriting a copy of the file descriptor table; if there was a
+ // way to fork without copying the entire file descriptor table (e.g., only copy specific
+ // entries), we could avoid this race.
+ //
+ // I was able to reproduce this problem reliably by running significantly more threads than
+ // there are CPU cores on my workstation - the more threads the more likely it happens.
+ //
+ // As a workaround, we put a synchronized block around the fork.
+ try {
+ futureCmdResult = cmd
+ .executeAsynchronously(new ByteArrayInputStream(new byte[0]), Command.NO_OBSERVER,
+ stdoutBuffer, stderrBuffer, true, false);
+ } catch (CommandException e) {
+ Throwables.throwIfInstanceOf(e.getCause(), IOException.class);
+ }
+ }
+
+ if (futureCmdResult != null) {
try {
- cmdResult =
- cmd.execute(Command.NO_INPUT, Command.NO_OBSERVER, stdoutBuffer, stderrBuffer, true);
+ cmdResult = futureCmdResult.get();
} catch (AbnormalTerminationException e) {
cmdResult = e.getResult();
- } catch (CommandException e) {
- // As of this writing, the cause can only be an IOException from the underlying library.
- IOException cause = (IOException) e.getCause();
- if ((attempt++ < 3) && cause.getMessage().endsWith("Text file busy")) {
- // We wait a bit to give the other forks some time to close their open file descriptors.
- Thread.sleep(10);
- continue;
- } else {
- throw cause;
- }
}
- break;
}
+
long timeoutMillis =
action.hasTimeout()
? Durations.toMillis(action.getTimeout())