aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorGravatar mschaller <mschaller@google.com>2017-07-11 18:21:36 +0200
committerGravatar László Csomor <laszlocsomor@google.com>2017-07-12 08:48:38 +0200
commitfd37b512d3a899da531cb6ba2cc70543246940fe (patch)
treea0fdb66c7c9d0cde4b1e72e56d068fe9f4bf2575 /src
parent937350211dcd55a4714ec32ebbf33fffcc42cdf2 (diff)
Ensure that shutdown commands end the server process before completion
This change ensures that the server process is terminated before the client process terminates, when evaluating a command that shuts down the server. When completing such a command, the server communicates to the client that the server will terminate itself by setting a termination_expected bit in the final RunResponse message. The client then waits up to 60s for the server process to actually terminate. If it does not, then the client SIGKILLs the server. Also makes the gRPC server stop accepting new commands before the shutdown command completes. Drive-by fix to comments on Search{Un,Null}aryOption. RELNOTES: Commands that shut down the server (like "shutdown") now ensure that the server process has terminated before the client process terminates. PiperOrigin-RevId: 161537480
Diffstat (limited to 'src')
-rw-r--r--src/main/cpp/blaze.cc30
-rw-r--r--src/main/cpp/blaze_util.cc45
-rw-r--r--src/main/cpp/blaze_util.h24
-rw-r--r--src/main/cpp/blaze_util_darwin.cc15
-rw-r--r--src/main/cpp/blaze_util_freebsd.cc15
-rw-r--r--src/main/cpp/blaze_util_linux.cc19
-rw-r--r--src/main/cpp/blaze_util_platform.h10
-rw-r--r--src/main/cpp/blaze_util_posix.cc24
-rw-r--r--src/main/cpp/blaze_util_windows.cc23
-rw-r--r--src/main/java/com/google/devtools/build/lib/server/GrpcServerImpl.java10
-rw-r--r--src/main/protobuf/command_server.proto4
11 files changed, 154 insertions, 65 deletions
diff --git a/src/main/cpp/blaze.cc b/src/main/cpp/blaze.cc
index 6809b0b7f7..3571b236d2 100644
--- a/src/main/cpp/blaze.cc
+++ b/src/main/cpp/blaze.cc
@@ -731,9 +731,8 @@ static void StartServerAndConnect(const WorkspaceLayout *workspace_layout,
// disaster.
int server_pid = GetServerPid(server_dir);
if (server_pid > 0) {
- if (VerifyServerProcess(server_pid, globals->options->output_base,
- globals->options->install_base)) {
- if (KillServerProcess(server_pid)) {
+ if (VerifyServerProcess(server_pid, globals->options->output_base)) {
+ if (KillServerProcess(server_pid, globals->options->output_base)) {
fprintf(stderr, "Killed non-responsive server process (pid=%d)\n",
server_pid);
SetRestartReasonIfNotSet(SERVER_UNRESPONSIVE);
@@ -1458,8 +1457,7 @@ bool GrpcBlazeServer::Connect() {
return false;
}
- if (!VerifyServerProcess(server_pid, globals->options->output_base,
- globals->options->install_base)) {
+ if (!VerifyServerProcess(server_pid, globals->options->output_base)) {
return false;
}
@@ -1587,11 +1585,13 @@ void GrpcBlazeServer::KillRunningServer() {
while (reader->Read(&response)) {
}
- // Kill the server process for good measure (if we know the server PID)
+ // Wait for the server process to terminate (if we know the server PID).
+ // If it does not terminate itself gracefully within 1m, terminate it.
if (globals->server_pid > 0 &&
- VerifyServerProcess(globals->server_pid, globals->options->output_base,
- globals->options->install_base)) {
- KillServerProcess(globals->server_pid);
+ !AwaitServerProcessTermination(globals->server_pid,
+ globals->options->output_base,
+ kPostShutdownGracePeriodSeconds)) {
+ KillServerProcess(globals->server_pid, globals->options->output_base);
}
connected_ = false;
@@ -1599,6 +1599,7 @@ void GrpcBlazeServer::KillRunningServer() {
unsigned int GrpcBlazeServer::Communicate() {
assert(connected_);
+ assert(globals->server_pid > 0);
vector<string> arg_vector;
string command = globals->option_processor->GetCommand();
@@ -1643,6 +1644,7 @@ unsigned int GrpcBlazeServer::Communicate() {
int exit_code = -1;
bool finished = false;
bool finished_warning_emitted = false;
+ bool termination_expected = false;
while (reader->Read(&response)) {
if (finished && !finished_warning_emitted) {
@@ -1659,6 +1661,7 @@ unsigned int GrpcBlazeServer::Communicate() {
if (response.finished()) {
exit_code = response.exit_code();
+ termination_expected = response.termination_expected();
finished = true;
}
@@ -1694,6 +1697,15 @@ unsigned int GrpcBlazeServer::Communicate() {
}
}
+ // If the server has shut down, but does not terminate itself within a 1m
+ // grace period, terminate it.
+ if (termination_expected &&
+ !AwaitServerProcessTermination(globals->server_pid,
+ globals->options->output_base,
+ kPostShutdownGracePeriodSeconds)) {
+ KillServerProcess(globals->server_pid, globals->options->output_base);
+ }
+
SendAction(CancelThreadAction::JOIN);
cancel_thread.join();
diff --git a/src/main/cpp/blaze_util.cc b/src/main/cpp/blaze_util.cc
index 2831ead175..be3e0ebfd9 100644
--- a/src/main/cpp/blaze_util.cc
+++ b/src/main/cpp/blaze_util.cc
@@ -37,6 +37,10 @@ using std::vector;
const char kServerPidFile[] = "server.pid.txt";
+const unsigned int kPostShutdownGracePeriodSeconds = 60;
+
+const unsigned int kPostKillGracePeriodSeconds = 10;
+
string MakeAbsolute(const string &path) {
if (path.empty()) {
return blaze_util::GetCwd();
@@ -163,4 +167,45 @@ bool IsArg(const string& arg) {
&& (arg != "-help") && (arg != "-h");
}
+void LogWait(unsigned int elapsed_seconds, unsigned int wait_seconds) {
+ SigPrintf("WARNING: Waiting for server process to terminate "
+ "(waited %d seconds, waiting at most %d)\n",
+ elapsed_seconds, wait_seconds);
+}
+
+bool AwaitServerProcessTermination(int pid, const string& output_base,
+ unsigned int wait_seconds) {
+ uint64_t st = GetMillisecondsMonotonic();
+ const unsigned int first_seconds = 5;
+ bool logged_first = false;
+ const unsigned int second_seconds = 10;
+ bool logged_second = false;
+ const unsigned int third_seconds = 30;
+ bool logged_third = false;
+
+ while (VerifyServerProcess(pid, output_base)) {
+ TrySleep(100);
+ uint64_t elapsed_millis = GetMillisecondsMonotonic() - st;
+ if (!logged_first && elapsed_millis > first_seconds * 1000) {
+ LogWait(first_seconds, wait_seconds);
+ logged_first = true;
+ }
+ if (!logged_second && elapsed_millis > second_seconds * 1000) {
+ LogWait(second_seconds, wait_seconds);
+ logged_second = true;
+ }
+ if (!logged_third && elapsed_millis > third_seconds * 1000) {
+ LogWait(third_seconds, wait_seconds);
+ logged_third = true;
+ }
+ if (elapsed_millis > wait_seconds * 1000) {
+ SigPrintf("INFO: Waited %d seconds for server process (pid=%d) to"
+ " terminate.\n",
+ wait_seconds, pid);
+ return false;
+ }
+ }
+ return true;
+}
+
} // namespace blaze
diff --git a/src/main/cpp/blaze_util.h b/src/main/cpp/blaze_util.h
index 4c8d6c291e..f2c3a2f187 100644
--- a/src/main/cpp/blaze_util.h
+++ b/src/main/cpp/blaze_util.h
@@ -52,13 +52,14 @@ bool GetNullaryOption(const char *arg, const char *key);
// Searches for 'key' in 'args' using GetUnaryOption. Arguments found after '--'
// are omitted from the search.
-// Returns true iff key is a flag in args.
+// Returns the value of the 'key' flag iff it occurs in args.
+// Returns NULL otherwise.
const char* SearchUnaryOption(const std::vector<std::string>& args,
const char* key);
// Searches for 'key' in 'args' using GetNullaryOption. Arguments found after
// '--' are omitted from the search.
-// Returns the value of the 'key' flag iff it occurs in args.
+// Returns true iff key is a flag in args.
bool SearchNullaryOption(const std::vector<std::string>& args,
const char* key);
@@ -82,6 +83,25 @@ bool CheckJavaVersionIsAtLeast(const std::string &jvm_version,
// Returns true iff arg is a valid command line argument for bazel.
bool IsArg(const std::string& arg);
+// Wait to see if the server process terminates. Checks the server's status
+// immediately, and repeats the check every 100ms until approximately
+// wait_seconds elapses or the server process terminates. Returns true if a
+// check sees that the server process terminated. Logs to stderr after 5, 10,
+// and 30 seconds if the wait lasts that long.
+bool AwaitServerProcessTermination(int pid, const std::string& output_base,
+ unsigned int wait_seconds);
+
+// The number of seconds the client will wait for the server process to
+// terminate itself after the client receives the final response from a command
+// that shuts down the server. After waiting this time, if the server process
+// remains, the client will forcibly kill the server.
+extern const unsigned int kPostShutdownGracePeriodSeconds;
+
+// The number of seconds the client will wait for the server process to
+// terminate after the client forcibly kills the server. After waiting this
+// time, if the server process remains, the client will die.
+extern const unsigned int kPostKillGracePeriodSeconds;
+
// Returns the string representation of `value`.
// Workaround for mingw where std::to_string is not implemented.
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52015.
diff --git a/src/main/cpp/blaze_util_darwin.cc b/src/main/cpp/blaze_util_darwin.cc
index 075b48a1e0..008b721e0a 100644
--- a/src/main/cpp/blaze_util_darwin.cc
+++ b/src/main/cpp/blaze_util_darwin.cc
@@ -196,16 +196,11 @@ void WriteSystemSpecificProcessIdentifier(
const string& server_dir, pid_t server_pid) {
}
-bool VerifyServerProcess(
- int pid, const string& output_base, const string& install_base) {
- // TODO(lberki): This might accidentally kill an unrelated process if the
- // server died and the PID got reused.
- return true;
-}
-
-bool KillServerProcess(int pid) {
- killpg(pid, SIGKILL);
- return true;
+bool VerifyServerProcess(int pid, const string &output_base) {
+ // TODO(lberki): This only checks for the process's existence, not whether
+ // its start time matches. Therefore this might accidentally kill an
+ // unrelated process if the server died and the PID got reused.
+ return killpg(pid, 0) == 0;
}
// Sets a flag on path to exclude the path from Apple's automatic backup service
diff --git a/src/main/cpp/blaze_util_freebsd.cc b/src/main/cpp/blaze_util_freebsd.cc
index 213af024e5..4ff2278b54 100644
--- a/src/main/cpp/blaze_util_freebsd.cc
+++ b/src/main/cpp/blaze_util_freebsd.cc
@@ -152,16 +152,11 @@ void WriteSystemSpecificProcessIdentifier(
const string& server_dir, pid_t server_pid) {
}
-bool VerifyServerProcess(
- int pid, const string& output_base, const string& install_base) {
- // TODO(lberki): This might accidentally kill an unrelated process if the
- // server died and the PID got reused.
- return true;
-}
-
-bool KillServerProcess(int pid) {
- killpg(pid, SIGKILL);
- return true;
+bool VerifyServerProcess(int pid, const string &output_base) {
+ // TODO(lberki): This only checks for the process's existence, not whether
+ // its start time matches. Therefore this might accidentally kill an
+ // unrelated process if the server died and the PID got reused.
+ return killpg(pid, 0) == 0;
}
// Not supported.
diff --git a/src/main/cpp/blaze_util_linux.cc b/src/main/cpp/blaze_util_linux.cc
index 2e5e07c04f..72eccb2d9a 100644
--- a/src/main/cpp/blaze_util_linux.cc
+++ b/src/main/cpp/blaze_util_linux.cc
@@ -234,8 +234,7 @@ void WriteSystemSpecificProcessIdentifier(
// On Linux we use a combination of PID and start time to identify the server
// process. That is supposed to be unique unless one can start more processes
// than there are PIDs available within a single jiffy.
-bool VerifyServerProcess(
- int pid, const string& output_base, const string& install_base) {
+bool VerifyServerProcess(int pid, const string& output_base) {
string start_time;
if (!GetStartTime(ToString(pid), &start_time)) {
// Cannot read PID file from /proc . Process died meantime, all is good. No
@@ -253,22 +252,6 @@ bool VerifyServerProcess(
return !file_present || recorded_start_time == start_time;
}
-bool KillServerProcess(int pid) {
- // Kill the process and make sure it's dead before proceeding.
- killpg(pid, SIGKILL);
- int check_killed_retries = 10;
- while (killpg(pid, 0) == 0) {
- if (check_killed_retries-- > 0) {
- sleep(1);
- } else {
- die(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR,
- "Attempted to kill stale blaze server process (pid=%d) using "
- "SIGKILL, but it did not die in a timely fashion.", pid);
- }
- }
- return true;
-}
-
// Not supported.
void ExcludePathFromBackup(const string &path) {
}
diff --git a/src/main/cpp/blaze_util_platform.h b/src/main/cpp/blaze_util_platform.h
index 05eb772062..b842ae8385 100644
--- a/src/main/cpp/blaze_util_platform.h
+++ b/src/main/cpp/blaze_util_platform.h
@@ -168,13 +168,17 @@ uint64_t AcquireLock(const std::string& output_base, bool batch_mode,
void ReleaseLock(BlazeLock* blaze_lock);
// Verifies whether the server process still exists. Returns true if it does.
-bool VerifyServerProcess(int pid, const std::string& output_base,
- const std::string& install_base);
+bool VerifyServerProcess(int pid, const std::string& output_base);
// Kills a server process based on its PID.
// Returns true if the server process was found and killed.
// WARNING! This function can be called from a signal handler!
-bool KillServerProcess(int pid);
+bool KillServerProcess(int pid, const std::string& output_base);
+
+// Wait for approximately the specified number of milliseconds. The actual
+// amount of time waited may be more or less because of interrupts or system
+// clock resolution.
+void TrySleep(unsigned int milliseconds);
// Mark path as being excluded from backups (if supported by operating system).
void ExcludePathFromBackup(const std::string& path);
diff --git a/src/main/cpp/blaze_util_posix.cc b/src/main/cpp/blaze_util_posix.cc
index af0ce17385..f5c7241db8 100644
--- a/src/main/cpp/blaze_util_posix.cc
+++ b/src/main/cpp/blaze_util_posix.cc
@@ -72,7 +72,9 @@ static void handler(int signum) {
"\n%s caught third interrupt signal; killed.\n\n",
SignalHandler::Get().GetGlobals()->options->product_name.c_str());
if (SignalHandler::Get().GetGlobals()->server_pid != -1) {
- KillServerProcess(SignalHandler::Get().GetGlobals()->server_pid);
+ KillServerProcess(
+ SignalHandler::Get().GetGlobals()->server_pid,
+ SignalHandler::Get().GetGlobals()->options->output_base);
}
_exit(1);
}
@@ -630,6 +632,26 @@ void ReleaseLock(BlazeLock* blaze_lock) {
close(blaze_lock->lockfd);
}
+bool KillServerProcess(int pid, const string& output_base) {
+ // Kill the process and make sure it's dead before proceeding.
+ killpg(pid, SIGKILL);
+ if (!AwaitServerProcessTermination(pid, output_base,
+ kPostKillGracePeriodSeconds)) {
+ die(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR,
+ "Attempted to kill stale server process (pid=%d) using "
+ "SIGKILL, but it did not die in a timely fashion.",
+ pid);
+ }
+ return true;
+}
+
+void TrySleep(unsigned int milliseconds) {
+ unsigned int seconds_part = milliseconds / 1000;
+ unsigned int nanoseconds_part = (milliseconds % 1000) * 1000 * 1000;
+ struct timespec sleeptime = {seconds_part, nanoseconds_part};
+ nanosleep(&sleeptime, NULL);
+}
+
string GetUserName() {
string user = GetEnv("USER");
if (!user.empty()) {
diff --git a/src/main/cpp/blaze_util_windows.cc b/src/main/cpp/blaze_util_windows.cc
index a5bcf45450..802b3c2395 100644
--- a/src/main/cpp/blaze_util_windows.cc
+++ b/src/main/cpp/blaze_util_windows.cc
@@ -122,7 +122,9 @@ BOOL WINAPI ConsoleCtrlHandler(_In_ DWORD ctrlType) {
"\n%s caught third Ctrl+C handler signal; killed.\n\n",
SignalHandler::Get().GetGlobals()->options->product_name.c_str());
if (SignalHandler::Get().GetGlobals()->server_pid != -1) {
- KillServerProcess(SignalHandler::Get().GetGlobals()->server_pid);
+ KillServerProcess(
+ SignalHandler::Get().GetGlobals()->server_pid,
+ SignalHandler::Get().GetGlobals()->options->output_base);
}
_exit(1);
}
@@ -171,7 +173,9 @@ static void handler(int signum) {
"\n%s caught third interrupt signal; killed.\n\n",
SignalHandler::Get().GetGlobals()->options->product_name.c_str());
if (SignalHandler::Get().GetGlobals()->server_pid != -1) {
- KillServerProcess(SignalHandler::Get().GetGlobals()->server_pid);
+ KillServerProcess(
+ SignalHandler::Get().GetGlobals()->server_pid,
+ SignalHandler::Get().GetGlobals()->options->output_base);
}
_exit(1);
}
@@ -1018,8 +1022,7 @@ bool CompareAbsolutePaths(const string& a, const string& b) {
// On Windows (and Linux) we use a combination of PID and start time to identify
// the server process. That is supposed to be unique unless one can start more
// processes than there are PIDs available within a single jiffy.
-bool VerifyServerProcess(
- int pid, const string& output_base, const string& install_base) {
+bool VerifyServerProcess(int pid, const string& output_base) {
AutoHandle process(
::OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid));
if (!process.IsValid()) {
@@ -1046,7 +1049,7 @@ bool VerifyServerProcess(
return !file_present || recorded_start_time == ToString(start_time);
}
-bool KillServerProcess(int pid) {
+bool KillServerProcess(int pid, const string& output_base) {
AutoHandle process(::OpenProcess(
PROCESS_TERMINATE | PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid));
DWORD exitcode = 0;
@@ -1058,12 +1061,18 @@ bool KillServerProcess(int pid) {
}
BOOL result = TerminateProcess(process, /*uExitCode*/ 0);
- if (!result) {
- blaze_util::PrintError("Cannot terminate server process with PID %d", pid);
+ if (!result || !AwaitServerProcessTermination(pid, output_base,
+ kPostKillGracePeriodSeconds)) {
+ pdie(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR,
+ "Cannot terminate server process with PID %d", pid);
}
return result;
}
+void TrySleep(unsigned int milliseconds) {
+ Sleep(milliseconds);
+}
+
// Not supported.
void ExcludePathFromBackup(const string &path) {
}
diff --git a/src/main/java/com/google/devtools/build/lib/server/GrpcServerImpl.java b/src/main/java/com/google/devtools/build/lib/server/GrpcServerImpl.java
index 032da198d3..6abef0e4ac 100644
--- a/src/main/java/com/google/devtools/build/lib/server/GrpcServerImpl.java
+++ b/src/main/java/com/google/devtools/build/lib/server/GrpcServerImpl.java
@@ -860,12 +860,17 @@ public class GrpcServerImpl implements RPCServer {
// the cancel request won't find the thread to interrupt)
Thread.interrupted();
+ boolean shutdown = commandExecutor.shutdown();
+ if (shutdown) {
+ server.shutdown();
+ }
RunResponse response =
RunResponse.newBuilder()
.setCookie(responseCookie)
.setCommandId(commandId)
.setFinished(true)
.setExitCode(exitCode)
+ .setTerminationExpected(shutdown)
.build();
try {
@@ -876,11 +881,6 @@ public class GrpcServerImpl implements RPCServer {
log.info(String.format("Client cancelled command %s just right before its end: %s",
commandId, e.getMessage()));
}
-
- if (commandExecutor.shutdown()) {
- pidFileWatcherThread.signalShutdown();
- server.shutdown();
- }
}
private final CommandServerGrpc.CommandServerImplBase commandServer =
diff --git a/src/main/protobuf/command_server.proto b/src/main/protobuf/command_server.proto
index 3dfedd4c91..79f44b9930 100644
--- a/src/main/protobuf/command_server.proto
+++ b/src/main/protobuf/command_server.proto
@@ -77,6 +77,10 @@ message RunResponse {
// as soon as possible. This is not required to be set (non-empty) on every
// response.
string command_id = 6;
+
+ // Whether the command has shut down the server; if set, the client should
+ // wait until the server process dies before finishing.
+ bool termination_expected = 7;
}
// Passed to CommandServer.cancel to initiate graceful cancellation of an