From 1977d92e17e6bb640cc5947ff76826928c7d7419 Mon Sep 17 00:00:00 2001 From: Lukacs Berki Date: Mon, 2 May 2016 09:31:37 +0000 Subject: Various cleanups and refactorings in the client: - Made the control flow much simpler and more understandable - Added some documentation about the interplay of the client and the server - Abstracted out POSIX mechanisms from blaze.cc so that they can be implemented properly on Windows - Added assertions that the methods on BlazeServer are called when they should be Polish for #930. -- MOS_MIGRATED_REVID=121256601 --- src/main/cpp/blaze.cc | 331 ++++++++++++++++++++++----------- src/main/cpp/blaze_util_mingw.cc | 27 ++- src/main/cpp/blaze_util_platform.h | 23 ++- src/main/cpp/blaze_util_posix.cc | 41 +++- src/main/protobuf/command_server.proto | 5 +- 5 files changed, 295 insertions(+), 132 deletions(-) (limited to 'src/main') diff --git a/src/main/cpp/blaze.cc b/src/main/cpp/blaze.cc index 4fdc75f157..5a696583b2 100644 --- a/src/main/cpp/blaze.cc +++ b/src/main/cpp/blaze.cc @@ -98,19 +98,101 @@ namespace blaze { static void WriteFileToStreamOrDie(FILE *stream, const char *file_name); static string BuildServerRequest(); +static int GetServerPid(const string &server_dir); -// Implementation of the communication protocol with the server process. -// This object has two states: connected and disconnected. Most of the methods -// can only be called in one of these states. +// The following is a treatise on how the interaction between the client and the +// server works. +// +// First, the client unconditionally acquires an flock() lock on +// $OUTPUT_BASE/lock then verifies if it has already extracted itself by +// checking if the directory it extracts itself to (install base + a checksum) +// is present. If not, then it does the extraction. Care is taken that this +// process is atomic so that Blazen in multiple output bases do not clash. +// +// Then the client tries to connect to the currently executing server and kills +// it if at least one of the following conditions is true: +// +// - The server is of the wrong version (as determined by the +// $OUTPUT_BASE/install symlink) +// - The server has different startup options than the client wants +// - The client wants to run the command in batch mode +// +// Then, if needed, the client adjusts the install link to indicate which +// version of the server it is running. +// +// In batch mode, the client then simply executes the server while taking care +// that the output base lock is kept until it finishes. +// +// If in server mode, the client starts up a server if needed then sends the +// command to the client and streams back stdout and stderr. If an AF_UNIX +// socket is used, the output base lock is held until the command finishes. If +// gRPC is used, the lock is released after the command is sent to the server +// (the server implements its own locking mechanism) + +// Synchronization between the client and the server is a little precarious +// because the client needs to know the PID of the server and it is not +// available using a Java API and we don't have JNI on Windows at the moment, +// so the server can't just communicate this over the communication channel. +// Thus, a PID file is used, but care needs to be taken that the contents of +// this PID file are right. +// +// Upon server startup, the PID file is written before the client spawns the +// server. Thus, when the client can connect, it can be certain that the PID +// file is up to date. +// +// Upon server shutdown, the PID file is deleted using a server shutdown hook. +// However, this happens *after* the server stopped listening, so it's possible +// that a client has already started up a server and written a new PID file. +// In order to avoid this, when the client starts up a new server, it reads the +// contents of the PID file and kills the process indicated in it (it could do +// with a bit more care, since PIDs can be reused, but for now, we just believe +// the PID file) +// +// Some more interesting scenarios: +// +// - The server receives a kill signal and it does not have a chance to delete +// the PID file: the client cannot connect, reads the PID file, kills the +// process indicated in it and starts up a new server. +// +// - The server stopped accepting connections but hasn't quit yet and a new +// client comes around: the new client will kill the server based on the +// PID file before a new server is started up. +// +// Alternative implementations: +// +// - Don't deal with PIDs at all. This would make it impossible for the client +// to deliver a SIGKILL to the server after three SIGINTs. It would only be +// possible with gRPC anyway. +// +// - Have the server check that the PID file containts the correct things +// before deleting them: there is a window of time between checking the file +// and deleting it in which a new server can overwrite the PID file. The +// output base lock cannot be acquired, either, because when starting up a +// new server, the client already holds it. +// +// - Delete the PID file before stopping to accept connections: then a client +// could come about after deleting the PID file but before stopping accepting +// connections. It would also not be resilient against a dead server that +// left a PID file around. +// +// - The communication method is changed between AF_UNIX and gRPC: the client +// will find that the server is not responsive and will kill it based on its +// PID. class BlazeServer { protected: - BlazeLock blaze_lock; + BlazeLock blaze_lock_; + bool connected_; public: virtual ~BlazeServer() {} + // Acquire a lock for the server running in this output base. Returns the + // number of milliseconds spent waiting for the lock. uint64_t AcquireLock(); + // Whether there is an active connection to a server. + bool Connected() const { return connected_; } + // Connect to the server. Returns if the connection was successful. Only // call this when this object is in disconnected state. If it returns true, // this object will be in connected state. @@ -128,10 +210,11 @@ class BlazeServer { // Disconnects and kills an existing server. Only call this when this object // is in connected state. - virtual void KillRunningServer(int server_pid) = 0; + virtual void KillRunningServer() = 0; // Cancel the currently running command. If there is no command currently - // running, the result is unspecified. + // running, the result is unspecified. When called, this object must be in + // connected state. virtual void Cancel() = 0; }; @@ -151,13 +234,13 @@ static void InitGlobals() { globals->restart_reason = NO_RESTART; } - uint64_t BlazeServer::AcquireLock() { return blaze::AcquireLock( globals->options.output_base, globals->options.batch, - globals->options.block_for_lock, &blaze_lock); + globals->options.block_for_lock, &blaze_lock_); } +// Communication method that uses an AF_UNIX socket and a custom protocol. class AfUnixBlazeServer : public BlazeServer { public: AfUnixBlazeServer(); @@ -166,13 +249,15 @@ class AfUnixBlazeServer : public BlazeServer { bool Connect() override; void Disconnect() override; unsigned int Communicate() override; - void KillRunningServer(int server_pid) override; + void KillRunningServer() override; void Cancel() override; private: int server_socket_; }; +// Communication method that uses gRPC on a socket bound to localhost. More +// documentation is in command_server.proto . class GrpcBlazeServer : public BlazeServer { public: GrpcBlazeServer(); @@ -181,7 +266,7 @@ class GrpcBlazeServer : public BlazeServer { bool Connect() override; void Disconnect() override; unsigned int Communicate() override; - void KillRunningServer(int server_pid) override; + void KillRunningServer() override; void Cancel() override; private: @@ -551,7 +636,7 @@ static string VerifyJavaVersionAndGetJvm() { // Starts the Blaze server. Returns a readable fd connected to the server. // This is currently used only to detect liveness. -static int StartServer() { +static void StartServer(BlazeServerStartup** server_startup) { vector jvm_args_vector = GetArgumentArray(); string argument_string = GetArgumentString(jvm_args_vector); string server_dir = globals->options.output_base + "/server"; @@ -574,19 +659,19 @@ static int StartServer() { // we can still print errors to the terminal. GoToWorkspace(); - return ExecuteDaemon(exe, jvm_args_vector, globals->jvm_log_file.c_str(), - server_dir); + ExecuteDaemon(exe, jvm_args_vector, globals->jvm_log_file.c_str(), + server_dir, server_startup); } -static bool KillRunningServerIfAny(BlazeServer *server); - // Replace this process with blaze in standalone/batch mode. // The batch mode blaze process handles the command and exits. // // This function passes the commands array to the blaze process. // This array should start with a command ("build", "info", etc.). static void StartStandalone(BlazeServer* server) { - KillRunningServerIfAny(server); + if (server->Connected()) { + server->KillRunningServer(); + } // Wall clock time since process startup. globals->startup_time = ProcessClock() / 1000000LL; @@ -629,9 +714,12 @@ static void StartStandalone(BlazeServer* server) { AfUnixBlazeServer::AfUnixBlazeServer() { server_socket_ = -1; + connected_ = false; } bool AfUnixBlazeServer::Connect() { + assert(!connected_); + if (server_socket_ == -1) { server_socket_ = socket(PF_UNIX, SOCK_STREAM, 0); if (server_socket_ == -1) { @@ -656,7 +744,18 @@ bool AfUnixBlazeServer::Connect() { free(resolved_path); sockaddr *paddr = reinterpret_cast(&addr); int result = connect(server_socket_, paddr, sizeof addr); - return result == 0; + connected_ = result == 0; + if (connected_) { + string server_dir = globals->options.output_base + "/server"; + globals->server_pid = GetServerPid(server_dir); + if (globals->server_pid <= 0) { + pdie(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR, + "can't get PID of existing server (server dir=%s)", + server_dir.c_str()); + } + } + + return connected_; } else if (errno == ENOENT) { // No socket means no server to connect to errno = ECONNREFUSED; return false; @@ -668,7 +767,10 @@ bool AfUnixBlazeServer::Connect() { } void AfUnixBlazeServer::Disconnect() { + assert(connected_); + close(server_socket_); + connected_ = false; server_socket_ = -1; } @@ -736,6 +838,8 @@ static int ForwardServerOutput(int socket, int output) { } unsigned int AfUnixBlazeServer::Communicate() { + assert(connected_); + const string request = BuildServerRequest(); // Send request (Request is written in a single chunk.) @@ -839,6 +943,7 @@ unsigned int AfUnixBlazeServer::Communicate() { } void AfUnixBlazeServer::Cancel() { + assert(connected_); kill(globals->server_pid, SIGINT); } @@ -898,10 +1003,8 @@ static int GetServerPid(const string &server_dir) { return result; } -// Connects to the Blaze server, returning the socket, or -1 if no -// server is running and !start. If start, attempts to start a new -// server, and exits on failure. -static bool ConnectToServer(BlazeServer *server, bool start) { +// Starts up a new server and connects to it. Exits if it didn't work not. +static void StartServerAndConnect(BlazeServer *server) { string server_dir = globals->options.output_base + "/server"; // The server dir has the socket, so we don't allow access by other @@ -913,70 +1016,49 @@ static bool ConnectToServer(BlazeServer *server, bool start) { string socket_file = blaze_util::JoinPath(server_dir, "server.socket"); - globals->server_pid = 0; - if (server->Connect()) { - globals->server_pid = GetServerPid(server_dir); - if (globals->server_pid == -1) { - pdie(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR, - "can't get PID of existing server (server dir=%s)", - server_dir.c_str()); - } - return true; - } - - if (start) { - // If we couldn't connect to the server check if there is still a PID file - // and if so, kill the server that wrote it. This can happen e.g. if the - // server is in a GC pause and therefore cannot respond to ping requests and - // having two server instances running in the same output base is a - // disaster. - int server_pid = GetServerPid(server_dir); - if (server_pid >= 0) { - killpg(server_pid, SIGKILL); - } - - SetScheduling(globals->options.batch_cpu_scheduling, - globals->options.io_nice_level); - - int fd = StartServer(); - if (fd != -1 && fcntl(fd, F_SETFL, O_NONBLOCK | fcntl(fd, F_GETFL))) { - pdie(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR, - "Failed: fcntl to enable O_NONBLOCK on pipe"); - } - // Give the server one minute to start up. - for (int ii = 0; ii < 600; ++ii) { - // 60s; enough time to connect with debugger - if (server->Connect()) { - if (ii) { - fputc('\n', stderr); - fflush(stderr); - } - globals->server_pid = GetServerPid(server_dir); - if (globals->server_pid == -1) { - pdie(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR, - "can't get pid of fresh server from connection (dir=%s)", - server_dir.c_str()); - } - return true; - } - fputc('.', stderr); - fflush(stderr); - poll(NULL, 0, 1000); // sleep 100ms. (usleep(3) is obsolete.) - char c; - if (fd != -1 && (read(fd, &c, 1) != -1 || errno != EAGAIN)) { - fprintf(stderr, "\nunexpected pipe read status: %s\n" - "Server presumed dead. Now printing '%s':\n", - strerror(errno), globals->jvm_log_file.c_str()); - WriteFileToStreamOrDie(stderr, globals->jvm_log_file.c_str()); - exit(blaze_exit_code::INTERNAL_ERROR); + // If we couldn't connect to the server check if there is still a PID file + // and if so, kill the server that wrote it. This can happen e.g. if the + // server is in a GC pause and therefore cannot respond to ping requests and + // having two server instances running in the same output base is a + // disaster. + int server_pid = GetServerPid(server_dir); + if (server_pid > 0) { + fprintf(stderr, + "Found non-responsive server process (pid=%d). Killing it.\n", + server_pid); + KillServerProcess(server_pid, globals->options.output_base); + } + + SetScheduling(globals->options.batch_cpu_scheduling, + globals->options.io_nice_level); + + BlazeServerStartup* server_startup; + StartServer(&server_startup); + // Give the server one minute to start up. + for (int ii = 0; ii < 600; ++ii) { + // 60s; enough time to connect with debugger + if (server->Connect()) { + if (ii) { + fputc('\n', stderr); + fflush(stderr); } + delete server_startup; + return; + } + fputc('.', stderr); + fflush(stderr); + poll(NULL, 0, 1000); // sleep 100ms. (usleep(3) is obsolete.) + if (!server_startup->IsStillAlive()) { + fprintf(stderr, "\nunexpected pipe read status: %s\n" + "Server presumed dead. Now printing '%s':\n", + strerror(errno), globals->jvm_log_file.c_str()); + WriteFileToStreamOrDie(stderr, globals->jvm_log_file.c_str()); + exit(blaze_exit_code::INTERNAL_ERROR); } - die(blaze_exit_code::INTERNAL_ERROR, - "\nError: couldn't connect to server at '%s' after 60 seconds.", - socket_file.c_str()); - // The if never falls through here. } - return false; + die(blaze_exit_code::INTERNAL_ERROR, + "\nError: couldn't connect to server at '%s' after 60 seconds.", + socket_file.c_str()); } // Poll until the given process denoted by pid goes away. Return false if this @@ -994,27 +1076,33 @@ static bool WaitForServerDeath(pid_t pid, int wait_time_secs) { return false; } -// Kills the specified running Blaze server. -void AfUnixBlazeServer::KillRunningServer(pid_t server_pid) { +// Kills the specified running Blaze server. First we send a SIGTERM, and if +// that does not kill the process, a SIGKILL. +void AfUnixBlazeServer::KillRunningServer() { + assert(connected_); + assert(globals->server_pid > 0); + close(server_socket_); server_socket_ = -1; fprintf(stderr, "Sending SIGTERM to previous %s server (pid=%d)... ", - globals->options.GetProductName().c_str(), server_pid); + globals->options.GetProductName().c_str(), globals->server_pid); fflush(stderr); - kill(server_pid, SIGTERM); - if (WaitForServerDeath(server_pid, 10)) { + kill(globals->server_pid, SIGTERM); + if (WaitForServerDeath(globals->server_pid, 10)) { fprintf(stderr, "done.\n"); + connected_ = false; return; } // If the previous attempt did not suceeded, kill the whole group. fprintf(stderr, "Sending SIGKILL to previous %s server process group (pid=%d)... ", - globals->options.GetProductName().c_str(), server_pid); + globals->options.GetProductName().c_str(), globals->server_pid); fflush(stderr); - killpg(server_pid, SIGKILL); - if (WaitForServerDeath(server_pid, 10)) { + killpg(globals->server_pid, SIGKILL); + if (WaitForServerDeath(globals->server_pid, 10)) { fprintf(stderr, "killed.\n"); + connected_ = false; return; } @@ -1022,15 +1110,6 @@ void AfUnixBlazeServer::KillRunningServer(pid_t server_pid) { pdie(blaze_exit_code::INTERNAL_ERROR, "SIGKILL unsuccessful after 10s"); } -// Kills the running Blaze server, if any. Finds the pid from the socket. -static bool KillRunningServerIfAny(BlazeServer* server) { - if (ConnectToServer(server, /*start=*/false)) { - server->KillRunningServer(globals->server_pid); - return true; - } - return false; -} - // Calls fsync() on the file (or directory) specified in 'file_path'. // pdie()'s if syncing fails. static void SyncFile(const char *file_path) { @@ -1281,7 +1360,7 @@ static bool ServerNeedsToBeKilled(const vector& args1, continue; } - if (args1[i] !=args2[i]) { + if (args1[i] != args2[i]) { return true; } @@ -1296,7 +1375,7 @@ static bool ServerNeedsToBeKilled(const vector& args1, // Kills the running Blaze server, if any, if the startup options do not match. static void KillRunningServerIfDifferentStartupOptions(BlazeServer* server) { - if (!ConnectToServer(server, /*start=*/false)) { + if (!server->Connected()) { return; } @@ -1319,9 +1398,7 @@ static void KillRunningServerIfDifferentStartupOptions(BlazeServer* server) { "WARNING: Running %s server needs to be killed, because the " "startup options are different.\n", globals->options.GetProductName().c_str()); - server->KillRunningServer(globals->server_pid); - } else { - server->Disconnect(); + server->KillRunningServer(); } } @@ -1340,9 +1417,11 @@ static void EnsureCorrectRunningVersion(BlazeServer* server) { bool ok = ReadDirectorySymlink(installation_path.c_str(), &prev_installation); if (!ok || !CompareAbsolutePaths( prev_installation, globals->options.install_base)) { - if (KillRunningServerIfAny(server)) { - globals->restart_reason = NEW_VERSION; + if (server->Connected()) { + server->KillRunningServer(); } + + globals->restart_reason = NEW_VERSION; UnlinkPath(installation_path.c_str()); if (!SymlinkDirectories(globals->options.install_base.c_str(), installation_path.c_str())) { @@ -1444,7 +1523,10 @@ static string BuildServerRequest() { // shuts down the client (by exit or signal). static ATTRIBUTE_NORETURN void SendServerRequest(BlazeServer* server) { while (true) { - ConnectToServer(server, /*start=*/true); + if (!server->Connected()) { + StartServerAndConnect(server); + } + // Check for deleted server cwd: string server_cwd = GetProcessCWD(globals->server_pid); // TODO(bazel-team): Is this check even necessary? If someone deletes or @@ -1474,7 +1556,7 @@ static ATTRIBUTE_NORETURN void SendServerRequest(BlazeServer* server) { fprintf(stderr, "Server's cwd moved or deleted (%s).\n", server_cwd.c_str()); } - server->KillRunningServer(globals->server_pid); + server->KillRunningServer(); } else { break; } @@ -1777,6 +1859,7 @@ int main(int argc, const char *argv[]) { EnsureFiniteStackLimit(); ExtractData(self_path); + blaze_server->Connect(); EnsureCorrectRunningVersion(blaze_server); KillRunningServerIfDifferentStartupOptions(blaze_server); @@ -1795,9 +1878,12 @@ static void null_grpc_log_function(gpr_log_func_args *args) { GrpcBlazeServer::GrpcBlazeServer() { gpr_set_log_function(null_grpc_log_function); + connected_ = false; } bool GrpcBlazeServer::Connect() { + assert(!connected_); + std::string server_dir = globals->options.output_base + "/server"; std::string port; std::string ipv4_prefix = "127.0.0.1:"; @@ -1842,7 +1928,15 @@ bool GrpcBlazeServer::Connect() { return false; } + globals->server_pid = GetServerPid(server_dir); + if (globals->server_pid <= 0) { + pdie(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR, + "can't get PID of existing server (server dir=%s)", + server_dir.c_str()); + } + this->client_ = std::move(client); + connected_ = true; return true; } @@ -1901,7 +1995,11 @@ void GrpcBlazeServer::CancelThread() { } } -void GrpcBlazeServer::KillRunningServer(int server_pid) { +// This will wait indefinitely until the server shuts down +void GrpcBlazeServer::KillRunningServer() { + assert(connected_); + assert(globals->server_pid > 0); + grpc::ClientContext context; command_server::RunRequest request; command_server::RunResponse response; @@ -1915,11 +2013,15 @@ void GrpcBlazeServer::KillRunningServer(int server_pid) { while (reader->Read(&response)) {} - // Send a SIGKILL for good measure - killpg(server_pid, SIGKILL); + // Kill the server process for good measure. + KillServerProcess(globals->server_pid, globals->options.output_base); + + connected_ = false; } unsigned int GrpcBlazeServer::Communicate() { + assert(connected_); + vector arg_vector; string command = globals->option_processor.GetCommand(); if (command != "") { @@ -1945,7 +2047,7 @@ unsigned int GrpcBlazeServer::Communicate() { // Release the server lock because the gRPC handles concurrent clients just // fine. Note that this may result in two "waiting for other client" messages // (one during server startup and one emitted by the server) - blaze::ReleaseLock(&blaze_lock); + blaze::ReleaseLock(&blaze_lock_); { std::unique_lock lock(cancel_thread_mutex_); @@ -1996,12 +2098,17 @@ unsigned int GrpcBlazeServer::Communicate() { } void GrpcBlazeServer::Disconnect() { + assert(connected_); + client_.reset(); request_cookie_ = ""; response_cookie_ = ""; + connected_ = false; } void GrpcBlazeServer::Cancel() { + assert(connected_); + std::unique_lock lock(cancel_thread_mutex_); // Wake up the cancellation thread and tell it to issue its RPC cancel_thread_action_ = CANCEL; diff --git a/src/main/cpp/blaze_util_mingw.cc b/src/main/cpp/blaze_util_mingw.cc index bdb8362b2b..27d13fc4a9 100644 --- a/src/main/cpp/blaze_util_mingw.cc +++ b/src/main/cpp/blaze_util_mingw.cc @@ -288,15 +288,23 @@ static bool DaemonizeOnWindows() { return false; } -int ExecuteDaemon(const string& exe, const std::vector& args_vector, - const string& daemon_output, const string& server_dir) { +// Keeping an eye on the server process on Windows is not implemented yet. +// TODO(lberki): Implement this, because otherwise if we can't start up a server +// process, the client will hang until it times out. +class DummyBlazeServerStartup : public BlazeServerStartup { + public: + DummyBlazeServerStartup() {} + virtual ~DummyBlazeServerStartup() {} + bool IsStillAlive() override { return true; } +}; + +void ExecuteDaemon(const string& exe, const std::vector& args_vector, + const string& daemon_output, const string& server_dir, + BlazeServerStartup** server_startup) { if (DaemonizeOnWindows()) { // We are the client process - // TODO(lberki): -1 is only an in-band signal that tells that there is no - // way we can tell if the server process is still alive. Fix this, because - // otherwise if any of the calls below fails, the client will hang until - // it times out. - return -1; + *server_startup = new DummyBlazeServerStartup(); + return; } SECURITY_ATTRIBUTES sa; @@ -408,6 +416,7 @@ void ExecuteProgram( if (!success) { pdie(255, "Error %u executing: %s\n", GetLastError(), cmdline); } + // The output base lock is held while waiting WaitForSingleObject(processInfo.hProcess, INFINITE); DWORD exit_code; GetExitCodeProcess(processInfo.hProcess, &exit_code); @@ -619,4 +628,8 @@ bool CompareAbsolutePaths(const string& a, const string& b) { return a_real == b_real; } +void KillServerProcess(int pid, const string& output_base) { + // Not implemented yet. TerminateProcess should work. +} + } // namespace blaze diff --git a/src/main/cpp/blaze_util_platform.h b/src/main/cpp/blaze_util_platform.h index 4afcd9dfbc..09dda2eab7 100644 --- a/src/main/cpp/blaze_util_platform.h +++ b/src/main/cpp/blaze_util_platform.h @@ -59,14 +59,20 @@ std::string GetDefaultHostJavabase(); // This function does not return on success. void ExecuteProgram(const string& exe, const std::vector& args_vector); +class BlazeServerStartup { + public: + virtual ~BlazeServerStartup() {} + virtual bool IsStillAlive() = 0; +}; + // Starts a daemon process with its standard output and standard error -// redirected to the file "daemon_output". Returns a file descriptor of a named -// pipe whose other end is held by the daemon and which is closed if the daemon -// exits or -1 if such a pipe cannot be created.. The PID of the daemon just -// started is written into server_dir, both as a symlink (for legacy reasons) -// and as a file. -int ExecuteDaemon(const string& exe, const std::vector& args_vector, - const string& daemon_output, const string& server_dir); +// redirected to the file "daemon_output". Sets server_startup to an object +// that can be used to query if the server is still alive. The PID of the +// daemon started is written into server_dir, both as a symlink (for legacy +// reasons) and as a file. +void ExecuteDaemon(const string& exe, const std::vector& args_vector, + const string& daemon_output, const string& server_dir, + BlazeServerStartup** server_startup); // Executes a subprocess and returns its standard output and standard error. // If this fails, exits with the appropriate error code. @@ -111,6 +117,9 @@ uint64_t AcquireLock(const string& output_base, bool batch_mode, // usual. void ReleaseLock(BlazeLock* blaze_lock); +// Kills a server process based on its output base and PID. +void KillServerProcess(int pid, const string& output_base); + } // namespace blaze #endif // BAZEL_SRC_MAIN_CPP_BLAZE_UTIL_PLATFORM_H_ diff --git a/src/main/cpp/blaze_util_posix.cc b/src/main/cpp/blaze_util_posix.cc index 11a74febb7..5c28bd372b 100644 --- a/src/main/cpp/blaze_util_posix.cc +++ b/src/main/cpp/blaze_util_posix.cc @@ -99,8 +99,36 @@ static void Daemonize(const string& daemon_output) { dup(STDOUT_FILENO); // stderr (2>&1) } -int ExecuteDaemon(const string& exe, const std::vector& args_vector, - const string& daemon_output, const string& server_dir) { +class PipeBlazeServerStartup : public BlazeServerStartup { + public: + PipeBlazeServerStartup(int pipe_fd); + virtual ~PipeBlazeServerStartup(); + bool IsStillAlive() override; + + private: + int pipe_fd; +}; + +PipeBlazeServerStartup::PipeBlazeServerStartup(int pipe_fd) { + this->pipe_fd = pipe_fd; + if (fcntl(pipe_fd, F_SETFL, O_NONBLOCK | fcntl(pipe_fd, F_GETFL))) { + pdie(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR, + "Failed: fcntl to enable O_NONBLOCK on pipe"); + } +} + +PipeBlazeServerStartup::~PipeBlazeServerStartup() { + close(pipe_fd); +} + +bool PipeBlazeServerStartup::IsStillAlive() { + char c; + return read(this->pipe_fd, &c, 1) == -1 && errno == EAGAIN; +} + +void ExecuteDaemon(const string& exe, const std::vector& args_vector, + const string& daemon_output, const string& server_dir, + BlazeServerStartup** server_startup) { int fds[2]; if (pipe(fds)) { pdie(blaze_exit_code::INTERNAL_ERROR, "pipe creation failed"); @@ -110,7 +138,8 @@ int ExecuteDaemon(const string& exe, const std::vector& args_vector, pdie(blaze_exit_code::INTERNAL_ERROR, "fork() failed"); } else if (child > 0) { // we're the parent close(fds[1]); // parent keeps only the reading side - return fds[0]; + *server_startup = new PipeBlazeServerStartup(fds[0]); + return; } else { close(fds[0]); // child keeps only the writing side } @@ -180,4 +209,10 @@ bool CompareAbsolutePaths(const string& a, const string& b) { return a == b; } +void KillServerProcess(int pid, const string& output_base) { + // TODO(lberki): This might accidentally kill an unrelated process if the + // server died and the PID got reused. + killpg(pid, SIGKILL); +} + } // namespace blaze. diff --git a/src/main/protobuf/command_server.proto b/src/main/protobuf/command_server.proto index 69fc62ee1c..20f408a98f 100644 --- a/src/main/protobuf/command_server.proto +++ b/src/main/protobuf/command_server.proto @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. // -// This file contains the protocol buffer representation of a build -// file or 'blaze query --output=proto' call. - +// This file contains the protocol used to communicate between the Bazel client +// and the server. syntax = "proto3"; package command_server; -- cgit v1.2.3