diff options
author | Lukacs Berki <lberki@google.com> | 2017-01-13 14:04:08 +0000 |
---|---|---|
committer | Marcel Hlopko <hlopko@google.com> | 2017-01-13 16:12:56 +0000 |
commit | 9f60395a04410abe3068c3e0fafbc917193d3f8a (patch) | |
tree | e57b2c42a127b4cf555631367e51aea1040103d0 /src/main/java/com/google | |
parent | 4e2509480fadc60937f640587a2a30908dbe73a9 (diff) |
Make the server commit suicide if its PID file vanishes.
--
PiperOrigin-RevId: 144434688
MOS_MIGRATED_REVID=144434688
Diffstat (limited to 'src/main/java/com/google')
8 files changed, 141 insertions, 76 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java index d13963f7bc..c9be71ad31 100644 --- a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java +++ b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java @@ -85,62 +85,24 @@ public class BlazeCommandDispatcher { private static final Set<String> ALL_HELP_OPTIONS = ImmutableSet.of("--help", "-help", "-h"); /** - * If the server needs to be shut down and how. - */ - public enum ShutdownMethod { - /** The server doesn't need to be shut down. */ - NONE, - - /** The server is to be shut down cleanly, e.g. as a result of "blaze shutdown". */ - CLEAN, - - /** - * The server is shut down as a result of a "blaze clean --expunge. - * - * <p>In this case, no files should be deleted on shutdown hooks, since clean also deletes the - * lock file, and there is a small possibility of the following sequence of events: - * - * <ol> - * <li> Client 1 runs "blaze clean --expunge" - * <li> Client 2 runs a command and waits for client 1 to finish - * <li> The clean command deletes everything including the lock file - * <li> Client 2 starts running and since the output base is empty, starts up a new server, - * which creates its own socket and PID files - * <li> The server used by client runs its shutdown hooks, deleting the PID files created by - * the new server - * </ol> - */ - EXPUNGE, - } - - /** * By throwing this exception, a command indicates that it wants to shutdown * the Blaze server process. - * See {@link BlazeCommandDispatcher#exec(List, OutErr, long)}. */ public static class ShutdownBlazeServerException extends Exception { private final int exitStatus; - private final ShutdownMethod method; - public ShutdownBlazeServerException(int exitStatus, ShutdownMethod method, Throwable cause) { + public ShutdownBlazeServerException(int exitStatus, Throwable cause) { super(cause); this.exitStatus = exitStatus; - this.method = method; } - public ShutdownBlazeServerException(int exitStatus, ShutdownMethod method) { - Preconditions.checkState(method != ShutdownMethod.NONE); + public ShutdownBlazeServerException(int exitStatus) { this.exitStatus = exitStatus; - this.method = method; } public int getExitStatus() { return exitStatus; } - - public ShutdownMethod getMethod() { - return method; - } } private final BlazeRuntime runtime; @@ -526,7 +488,7 @@ public class BlazeCommandDispatcher { BugReport.printBug(outErr, e); BugReport.sendBugReport(e, args, crashData); numericExitCode = BugReport.getExitCodeForThrowable(e); - throw new ShutdownBlazeServerException(numericExitCode, ShutdownMethod.CLEAN, e); + throw new ShutdownBlazeServerException(numericExitCode, e); } finally { env.getEventBus().post(new AfterCommandEvent()); runtime.afterCommand(env, numericExitCode); diff --git a/src/main/java/com/google/devtools/build/lib/runtime/BlazeRuntime.java b/src/main/java/com/google/devtools/build/lib/runtime/BlazeRuntime.java index 532923c54a..434d0991b7 100644 --- a/src/main/java/com/google/devtools/build/lib/runtime/BlazeRuntime.java +++ b/src/main/java/com/google/devtools/build/lib/runtime/BlazeRuntime.java @@ -120,6 +120,7 @@ public final class BlazeRuntime { private final Iterable<BlazeModule> blazeModules; private final Map<String, BlazeCommand> commandMap = new LinkedHashMap<>(); private final Clock clock; + private final Runnable abruptShutdownHandler; private final PackageFactory packageFactory; private final ConfigurationFactory configurationFactory; @@ -155,6 +156,7 @@ public final class BlazeRuntime { ConfigurationFactory configurationFactory, ImmutableMap<String, InfoItem> infoItems, Clock clock, + Runnable abruptShutdownHandler, OptionsProvider startupOptionsProvider, Iterable<BlazeModule> blazeModules, SubscriberExceptionHandler eventBusExceptionHandler, @@ -174,6 +176,7 @@ public final class BlazeRuntime { this.configurationFactory = configurationFactory; this.infoItems = infoItems; this.clock = clock; + this.abruptShutdownHandler = abruptShutdownHandler; this.startupOptionsProvider = startupOptionsProvider; this.queryEnvironmentFactory = queryEnvironmentFactory; this.queryFunctions = queryFunctions; @@ -481,6 +484,12 @@ public final class BlazeRuntime { } } + public void prepareForAbruptShutdown() { + if (abruptShutdownHandler != null) { + abruptShutdownHandler.run(); + } + } + /** Invokes {@link BlazeModule#blazeShutdownOnCrash()} on all registered modules. */ public void shutdownOnCrash() { for (BlazeModule module : blazeModules) { @@ -728,7 +737,7 @@ public final class BlazeRuntime { BlazeRuntime runtime; try { - runtime = newRuntime(modules, commandLineOptions.getStartupArgs()); + runtime = newRuntime(modules, commandLineOptions.getStartupArgs(), null); } catch (OptionsParsingException e) { OutErr.SYSTEM_OUT_ERR.printErr(e.getMessage()); return ExitCode.COMMAND_LINE_ERROR.getNumericExitCode(); @@ -814,7 +823,15 @@ public final class BlazeRuntime { private static RPCServer createBlazeRPCServer( Iterable<BlazeModule> modules, List<String> args) throws IOException, OptionsParsingException, AbruptExitException { - BlazeRuntime runtime = newRuntime(modules, args); + final RPCServer[] rpcServer = new RPCServer[1]; + Runnable prepareForAbruptShutdown = new Runnable() { + @Override + public void run() { + rpcServer[0].prepareForAbruptShutdown(); + } + }; + + BlazeRuntime runtime = newRuntime(modules, args, prepareForAbruptShutdown); BlazeCommandDispatcher dispatcher = new BlazeCommandDispatcher(runtime); CommandExecutor commandExecutor = new CommandExecutor(runtime, dispatcher); @@ -825,13 +842,15 @@ public final class BlazeRuntime { // gRPC server is not compiled in so that we don't need gRPC for bootstrapping. Class<?> factoryClass = Class.forName( "com.google.devtools.build.lib.server.GrpcServerImpl$Factory"); - RPCServer.Factory factory = (RPCServer.Factory) factoryClass.getConstructor().newInstance(); - return factory.create(commandExecutor, runtime.getClock(), - startupOptions.commandPort, runtime.getServerDirectory(), - startupOptions.maxIdleSeconds); + RPCServer.Factory factory = (RPCServer.Factory) factoryClass.getConstructor().newInstance(); + rpcServer[0] = factory.create(commandExecutor, runtime.getClock(), + startupOptions.commandPort, runtime.getServerDirectory(), + startupOptions.maxIdleSeconds); + return rpcServer[0]; } catch (ReflectiveOperationException | IllegalArgumentException e) { throw new AbruptExitException("gRPC server not compiled in", ExitCode.BLAZE_INTERNAL_ERROR); } + } private static Function<String, String> sourceFunctionForMap(final Map<String, String> map) { @@ -890,7 +909,8 @@ public final class BlazeRuntime { * an error string that, if not null, describes a fatal initialization failure that makes * this runtime unsuitable for real commands */ - private static BlazeRuntime newRuntime(Iterable<BlazeModule> blazeModules, List<String> args) + private static BlazeRuntime newRuntime(Iterable<BlazeModule> blazeModules, List<String> args, + Runnable abruptShutdownHandler) throws AbruptExitException, OptionsParsingException { OptionsProvider options = parseOptions(blazeModules, args); for (BlazeModule module : blazeModules) { @@ -951,6 +971,7 @@ public final class BlazeRuntime { .setServerDirectories(serverDirectories) .setStartupOptionsProvider(options) .setClock(clock) + .setAbruptShutdownHandler(abruptShutdownHandler) // TODO(bazel-team): Make BugReportingExceptionHandler the default. // See bug "Make exceptions in EventBus subscribers fatal" .setEventBusExceptionHandler( @@ -1058,6 +1079,7 @@ public final class BlazeRuntime { public static class Builder { private ServerDirectories serverDirectories; private Clock clock; + private Runnable abruptShutdownHandler; private OptionsProvider startupOptionsProvider; private final List<BlazeModule> blazeModules = new ArrayList<>(); private SubscriberExceptionHandler eventBusExceptionHandler = new RemoteExceptionHandler(); @@ -1138,6 +1160,7 @@ public final class BlazeRuntime { configurationFactory, serverBuilder.getInfoItems(), clock, + abruptShutdownHandler, startupOptionsProvider, ImmutableList.copyOf(blazeModules), eventBusExceptionHandler, @@ -1162,6 +1185,11 @@ public final class BlazeRuntime { return this; } + public Builder setAbruptShutdownHandler(Runnable handler) { + this.abruptShutdownHandler = handler; + return this; + } + public Builder setStartupOptionsProvider(OptionsProvider startupOptionsProvider) { this.startupOptionsProvider = startupOptionsProvider; return this; diff --git a/src/main/java/com/google/devtools/build/lib/runtime/CommandExecutor.java b/src/main/java/com/google/devtools/build/lib/runtime/CommandExecutor.java index 52e15f26cf..6f5b7b6e8b 100644 --- a/src/main/java/com/google/devtools/build/lib/runtime/CommandExecutor.java +++ b/src/main/java/com/google/devtools/build/lib/runtime/CommandExecutor.java @@ -13,10 +13,8 @@ // limitations under the License. package com.google.devtools.build.lib.runtime; -import com.google.devtools.build.lib.runtime.BlazeCommandDispatcher.ShutdownMethod; import com.google.devtools.build.lib.server.ServerCommand; import com.google.devtools.build.lib.util.io.OutErr; - import java.io.PrintWriter; import java.io.StringWriter; import java.util.List; @@ -30,12 +28,12 @@ import java.util.logging.Logger; public class CommandExecutor implements ServerCommand { private static final Logger LOG = Logger.getLogger(CommandExecutor.class.getName()); - private ShutdownMethod shutdown; + private boolean shutdown; private final BlazeRuntime runtime; private final BlazeCommandDispatcher dispatcher; CommandExecutor(BlazeRuntime runtime, BlazeCommandDispatcher dispatcher) { - this.shutdown = ShutdownMethod.NONE; + this.shutdown = false; this.runtime = runtime; this.dispatcher = dispatcher; } @@ -56,7 +54,7 @@ public class CommandExecutor implements ServerCommand { writer.flush(); LOG.severe(message.toString()); } - shutdown = e.getMethod(); + shutdown = true; runtime.shutdown(); dispatcher.shutdown(); return e.getExitStatus(); @@ -64,7 +62,7 @@ public class CommandExecutor implements ServerCommand { } @Override - public ShutdownMethod shutdown() { + public boolean shutdown() { return shutdown; } } diff --git a/src/main/java/com/google/devtools/build/lib/runtime/commands/CleanCommand.java b/src/main/java/com/google/devtools/build/lib/runtime/commands/CleanCommand.java index dc08954b57..368529e804 100644 --- a/src/main/java/com/google/devtools/build/lib/runtime/commands/CleanCommand.java +++ b/src/main/java/com/google/devtools/build/lib/runtime/commands/CleanCommand.java @@ -20,7 +20,6 @@ import com.google.devtools.build.lib.buildtool.OutputDirectoryLinksUtils; import com.google.devtools.build.lib.events.Event; import com.google.devtools.build.lib.runtime.BlazeCommand; import com.google.devtools.build.lib.runtime.BlazeCommandDispatcher.ShutdownBlazeServerException; -import com.google.devtools.build.lib.runtime.BlazeCommandDispatcher.ShutdownMethod; import com.google.devtools.build.lib.runtime.Command; import com.google.devtools.build.lib.runtime.CommandEnvironment; import com.google.devtools.build.lib.shell.CommandException; @@ -208,6 +207,7 @@ public final class CleanCommand implements BlazeCommand { } if (cleanOptions.expunge) { LOG.info("Expunging..."); + env.getRuntime().prepareForAbruptShutdown(); // Delete the big subdirectories with the important content first--this // will take the most time. Then quickly delete the little locks, logs // and links right before we exit. Once the lock file is gone there will @@ -217,6 +217,7 @@ public final class CleanCommand implements BlazeCommand { FileSystemUtils.deleteTree(outputBase); } else if (cleanOptions.expunge_async) { LOG.info("Expunging asynchronously..."); + env.getRuntime().prepareForAbruptShutdown(); asyncClean(env, outputBase, "Output base"); } else { LOG.info("Output cleaning..."); @@ -242,7 +243,7 @@ public final class CleanCommand implements BlazeCommand { env.getRuntime().getProductName()); // shutdown on expunge cleans if (cleanOptions.expunge || cleanOptions.expunge_async) { - throw new ShutdownBlazeServerException(0, ShutdownMethod.EXPUNGE); + throw new ShutdownBlazeServerException(0); } System.gc(); } diff --git a/src/main/java/com/google/devtools/build/lib/runtime/commands/ShutdownCommand.java b/src/main/java/com/google/devtools/build/lib/runtime/commands/ShutdownCommand.java index 766bc002ab..640602d168 100644 --- a/src/main/java/com/google/devtools/build/lib/runtime/commands/ShutdownCommand.java +++ b/src/main/java/com/google/devtools/build/lib/runtime/commands/ShutdownCommand.java @@ -15,7 +15,6 @@ package com.google.devtools.build.lib.runtime.commands; import com.google.devtools.build.lib.runtime.BlazeCommand; import com.google.devtools.build.lib.runtime.BlazeCommandDispatcher.ShutdownBlazeServerException; -import com.google.devtools.build.lib.runtime.BlazeCommandDispatcher.ShutdownMethod; import com.google.devtools.build.lib.runtime.Command; import com.google.devtools.build.lib.runtime.CommandEnvironment; import com.google.devtools.build.lib.util.ExitCode; @@ -64,7 +63,7 @@ public final class ShutdownCommand implements BlazeCommand { if (limit == 0 || Runtime.getRuntime().totalMemory() > limit * 1000L * 1000) { - throw new ShutdownBlazeServerException(0, ShutdownMethod.CLEAN); + throw new ShutdownBlazeServerException(0); } return ExitCode.SUCCESS; } diff --git a/src/main/java/com/google/devtools/build/lib/server/GrpcServerImpl.java b/src/main/java/com/google/devtools/build/lib/server/GrpcServerImpl.java index ca7d3ea4f2..0fc7a06a64 100644 --- a/src/main/java/com/google/devtools/build/lib/server/GrpcServerImpl.java +++ b/src/main/java/com/google/devtools/build/lib/server/GrpcServerImpl.java @@ -437,6 +437,54 @@ public class GrpcServerImpl implements RPCServer { } } + /** + * A thread that watches if the PID file changes and shuts down the server immediately if so. + */ + private class PidFileWatcherThread extends Thread { + private boolean shuttingDown = false; + + private PidFileWatcherThread() { + super("pid-file-watcher"); + setDaemon(true); + } + + // The synchronized block is here so that if the "PID file deleted" timer kicks in during a + // regular shutdown, they don't race. + private synchronized void signalShutdown() { + shuttingDown = true; + } + + @Override + public void run() { + while (true) { + Uninterruptibles.sleepUninterruptibly(5, TimeUnit.SECONDS); + boolean ok = false; + try { + String pidFileContents = new String(FileSystemUtils.readContentAsLatin1(pidFile)); + ok = pidFileContents.equals(pidInFile); + } catch (IOException e) { + log.info("Cannot read PID file: " + e.getMessage()); + // Handled by virtue of ok not being set to true + } + + if (!ok) { + synchronized (PidFileWatcherThread.this) { + if (shuttingDown) { + log.warning("PID file deleted or overwritten but shutdown is already in progress"); + break; + } + + shuttingDown = true; + // Someone overwrote the PID file. Maybe it's another server, so shut down as quickly + // as possible without even running the shutdown hooks (that would delete it) + log.severe("PID file deleted or overwritten, exiting as quickly as possible"); + Runtime.getRuntime().halt(ExitCode.BLAZE_INTERNAL_ERROR.getNumericExitCode()); + } + } + } + } + } + // These paths are all relative to the server directory private static final String PORT_FILE = "command_port"; private static final String REQUEST_COOKIE_FILE = "request_cookie"; @@ -455,6 +503,9 @@ public class GrpcServerImpl implements RPCServer { private final String responseCookie; private final AtomicLong interruptCounter = new AtomicLong(0); private final int maxIdleSeconds; + private final PidFileWatcherThread pidFileWatcherThread; + private final Path pidFile; + private final String pidInFile; private Server server; private final int port; @@ -465,7 +516,8 @@ public class GrpcServerImpl implements RPCServer { // server.pid was written in the C++ launcher after fork() but before exec() . // The client only accesses the pid file after connecting to the socket // which ensures that it gets the correct pid value. - Path pidFile = serverDirectory.getRelative("server.pid.txt"); + pidFile = serverDirectory.getRelative("server.pid.txt"); + pidInFile = new String(FileSystemUtils.readContentAsLatin1(pidFile)); deleteAtExit(pidFile, /*deleteParent=*/ false); this.commandExecutor = commandExecutor; @@ -486,6 +538,9 @@ public class GrpcServerImpl implements RPCServer { SecureRandom random = new SecureRandom(); requestCookie = generateCookie(random, 16); responseCookie = generateCookie(random, 16); + + pidFileWatcherThread = new PidFileWatcherThread(); + pidFileWatcherThread.start(); } private static String generateCookie(SecureRandom random, int byteCount) { @@ -566,6 +621,32 @@ public class GrpcServerImpl implements RPCServer { server.shutdown(); } + /** + * This is called when the server is shut down as a result of a "clean --expunge". + * + * <p>In this case, no files should be deleted on shutdown hooks, since clean also deletes the + * lock file, and there is a small possibility of the following sequence of events: + * + * <ol> + * <li> Client 1 runs "blaze clean --expunge" + * <li> Client 2 runs a command and waits for client 1 to finish + * <li> The clean command deletes everything including the lock file + * <li> Client 2 starts running and since the output base is empty, starts up a new server, + * which creates its own socket and PID files + * <li> The server used by client runs its shutdown hooks, deleting the PID files created by + * the new server + * </ol> + * + * It also disables the "die when the PID file changes" handler so that it doesn't kill the server + * while the "clean --expunge" commmand is running. + */ + + @Override + public void prepareForAbruptShutdown() { + disableShutdownHooks(); + pidFileWatcherThread.signalShutdown(); + } + @Override public void interrupt() { synchronized (runningCommands) { @@ -770,18 +851,9 @@ public class GrpcServerImpl implements RPCServer { commandId, e.getMessage())); } - switch (commandExecutor.shutdown()) { - case NONE: - break; - - case CLEAN: - server.shutdown(); - break; - - case EXPUNGE: - disableShutdownHooks(); - server.shutdown(); - break; + if (commandExecutor.shutdown()) { + pidFileWatcherThread.signalShutdown(); + server.shutdown(); } } diff --git a/src/main/java/com/google/devtools/build/lib/server/RPCServer.java b/src/main/java/com/google/devtools/build/lib/server/RPCServer.java index 8b7aefbef4..449bb9761f 100644 --- a/src/main/java/com/google/devtools/build/lib/server/RPCServer.java +++ b/src/main/java/com/google/devtools/build/lib/server/RPCServer.java @@ -41,4 +41,12 @@ public interface RPCServer { * Called when the server receives a SIGINT. */ void interrupt(); + + /** + * Prepares for the server shutting down prematurely. + * + * <p>Used in <code>clean --expunge</code> where the server state is deleted from the disk and + * we need to make sure that everything works during such an drastic measure. + */ + void prepareForAbruptShutdown(); } diff --git a/src/main/java/com/google/devtools/build/lib/server/ServerCommand.java b/src/main/java/com/google/devtools/build/lib/server/ServerCommand.java index e2d6d85d72..7b0a075f75 100644 --- a/src/main/java/com/google/devtools/build/lib/server/ServerCommand.java +++ b/src/main/java/com/google/devtools/build/lib/server/ServerCommand.java @@ -14,9 +14,7 @@ package com.google.devtools.build.lib.server; import com.google.devtools.build.lib.runtime.BlazeCommandDispatcher; -import com.google.devtools.build.lib.runtime.BlazeCommandDispatcher.ShutdownMethod; import com.google.devtools.build.lib.util.io.OutErr; - import java.util.List; /** @@ -33,8 +31,7 @@ public interface ServerCommand { String clientDescription, long firstContactTime) throws InterruptedException; /** - * Whether the server needs to be shutdown, and if so, in what manner. + * Whether the server needs to be shut down. */ - ShutdownMethod shutdown(); - + boolean shutdown(); } |