diff options
author | 2017-07-14 17:11:48 +0200 | |
---|---|---|
committer | 2017-07-17 10:10:42 +0200 | |
commit | e0b6a21750f93739de4e0ff50eb13b90c60cc286 (patch) | |
tree | 42bcc076dd62e30028c5cefb1014aa86e81eac30 /src/tools | |
parent | a76c94be7c56b93fc5a2f9ececfba7ac1f61f69c (diff) |
Workaround for the Linux-induced race that causes ETXTBSY
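Retry command execution, after a short sleep, when the failure looks like ETXTBSY (i.e., the underlying IOException message ends with "Text file busy").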
Fixed #3356.
PiperOrigin-RevId: 161958095
Diffstat (limited to 'src/tools')
-rw-r--r-- | src/tools/remote_worker/src/main/java/com/google/devtools/build/remote/ExecutionServer.java | 43 |
1 file changed, 34 insertions, 9 deletions
diff --git a/src/tools/remote_worker/src/main/java/com/google/devtools/build/remote/ExecutionServer.java b/src/tools/remote_worker/src/main/java/com/google/devtools/build/remote/ExecutionServer.java index 0d38e4039e..3e58092cdf 100644 --- a/src/tools/remote_worker/src/main/java/com/google/devtools/build/remote/ExecutionServer.java +++ b/src/tools/remote_worker/src/main/java/com/google/devtools/build/remote/ExecutionServer.java @@ -200,15 +200,40 @@ final class ExecutionServer extends ExecutionImplBase { execRoot.getPathString()); long startTime = System.currentTimeMillis(); CommandResult cmdResult = null; - try { - cmdResult = - cmd.execute(Command.NO_INPUT, Command.NO_OBSERVER, stdoutBuffer, stderrBuffer, true); - } catch (AbnormalTerminationException e) { - cmdResult = e.getResult(); - } catch (CommandException e) { - // At the time this comment was written, this must be a ExecFailedException encapsulating - // an IOException from the underlying Subprocess.Factory. - cmdResult = null; + // Linux does not provide a safe API for a multi-threaded program to fork a subprocess. Consider + // the case where two threads both write an executable file and then try to execute it. It can + // happen that the first thread writes its executable file, with the file descriptor still + // being open when the second thread forks, with the fork inheriting a copy of the file + // descriptor. Then the first thread closes the original file descriptor, and proceeds to + // execute the file. At that point Linux sees an open file descriptor to the file and returns + // ETXTBSY (Text file busy) as an error. This race is inherent in the fork / exec duality, with + // fork always inheriting a copy of the file descriptor table; if there was a way to fork + // without copying the entire file descriptor table (e.g., only copy specific entries), we could + // avoid this race. 
+ // + // I was able to reproduce this problem reliably by running significantly more threads than + // there are CPU cores on my workstation - the more threads the more likely it happens. + // + // As a workaround, we retry up to two times before we let the exception propagate. + int attempt = 0; + while (true) { + try { + cmdResult = + cmd.execute(Command.NO_INPUT, Command.NO_OBSERVER, stdoutBuffer, stderrBuffer, true); + } catch (AbnormalTerminationException e) { + cmdResult = e.getResult(); + } catch (CommandException e) { + // As of this writing, the cause can only be an IOException from the underlying library. + IOException cause = (IOException) e.getCause(); + if ((attempt++ < 3) && cause.getMessage().endsWith("Text file busy")) { + // We wait a bit to give the other forks some time to close their open file descriptors. + Thread.sleep(10); + continue; + } else { + throw cause; + } + } + break; } long timeoutMillis = action.hasTimeout() |