From 2732df0202499aff75de38c012fec57ed53a5a30 Mon Sep 17 00:00:00 2001 From: olaola Date: Fri, 16 Mar 2018 08:48:13 -0700 Subject: Propagating remote results, including stdout/err, to Bazel on execution timeouts. The refactoring to have an Exception that contains partial results will also be used in the next CL, in order to propagate and save remote server logs. RELNOTES: None PiperOrigin-RevId: 189344465 --- .../build/lib/remote/ExecutionStatusException.java | 61 +++++++++++++++ .../build/lib/remote/GrpcRemoteExecutor.java | 16 ++-- .../build/lib/remote/RemoteRetrierUtils.java | 5 ++ .../build/lib/remote/RemoteSpawnRunner.java | 36 +++++---- .../build/lib/remote/TimeoutException.java | 22 ------ .../build/lib/remote/RemoteSpawnRunnerTest.java | 31 +++++++- src/test/shell/bazel/remote_execution_test.sh | 10 ++- .../build/remote/worker/ExecutionServer.java | 11 ++- .../build/remote/worker/WatcherServer.java | 91 ++++++++++++---------- 9 files changed, 188 insertions(+), 95 deletions(-) create mode 100644 src/main/java/com/google/devtools/build/lib/remote/ExecutionStatusException.java delete mode 100644 src/main/java/com/google/devtools/build/lib/remote/TimeoutException.java (limited to 'src') diff --git a/src/main/java/com/google/devtools/build/lib/remote/ExecutionStatusException.java b/src/main/java/com/google/devtools/build/lib/remote/ExecutionStatusException.java new file mode 100644 index 0000000000..63839a7ae8 --- /dev/null +++ b/src/main/java/com/google/devtools/build/lib/remote/ExecutionStatusException.java @@ -0,0 +1,61 @@ +// Copyright 2017 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package com.google.devtools.build.lib.remote; + +import com.google.devtools.remoteexecution.v1test.ExecuteResponse; +import com.google.rpc.Status; +import io.grpc.Status.Code; +import io.grpc.StatusRuntimeException; +import javax.annotation.Nullable; + +/** + * Exception to signal that a remote execution has failed with a certain status received from the + * server, and other details, such as the action result and the server logs. The exception may be + * retriable or not, depending on the status/details. + */ +public class ExecutionStatusException extends StatusRuntimeException { + private final Status status; + private final ExecuteResponse response; + + public ExecutionStatusException(Status status, @Nullable ExecuteResponse response) { + super(convertStatus(status, response)); + this.status = status; + this.response = response; + } + + private static io.grpc.Status convertStatus(Status status, @Nullable ExecuteResponse response) { + io.grpc.Status result = + io.grpc.Status.fromCodeValue( + // Hack: convert to non-retriable exception on timeouts. + isExecutionTimeout(status, response) + ? Code.FAILED_PRECONDITION.value() + : status.getCode()); + return result.withDescription(status.getMessage()); + } + + private static boolean isExecutionTimeout(Status status, @Nullable ExecuteResponse response) { + return response != null + && response.getStatus().equals(status) + && status.getCode() == Code.DEADLINE_EXCEEDED.value(); + } + + public boolean isExecutionTimeout() { + return isExecutionTimeout(status, response); + } + + @Nullable + public ExecuteResponse getResponse() { + return response; + } +} diff --git a/src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java b/src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java index 3b91174a06..fb8cf0d9c4 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java +++ b/src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java @@ -73,29 +73,23 @@ class GrpcRemoteExecutor { .withCallCredentials(callCredentials); } - private void handleStatus(Status statusProto) throws IOException { - StatusRuntimeException e = StatusProto.toStatusRuntimeException(statusProto); - if (e.getStatus().getCode() == Code.OK) { + private void handleStatus(Status statusProto, @Nullable ExecuteResponse resp) throws IOException { + if (statusProto.getCode() == Code.OK.value()) { return; } - if (e.getStatus().getCode() == Code.DEADLINE_EXCEEDED) { - // This was caused by the command itself exceeding the timeout, - // therefore it is not retriable. - throw new TimeoutException(); - } - throw e; + throw new ExecutionStatusException(statusProto, resp); } private @Nullable ExecuteResponse getOperationResponse(Operation op) throws IOException { if (op.getResultCase() == Operation.ResultCase.ERROR) { - handleStatus(op.getError()); + handleStatus(op.getError(), null); } if (op.getDone()) { Preconditions.checkState(op.getResultCase() != Operation.ResultCase.RESULT_NOT_SET); try { ExecuteResponse resp = op.getResponse().unpack(ExecuteResponse.class); if (resp.hasStatus()) { - handleStatus(resp.getStatus()); + handleStatus(resp.getStatus(), resp); } return resp; } catch (InvalidProtocolBufferException e) { diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java index f518b43682..70c73fbcdf 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java @@ -30,4 +30,9 @@ public final class RemoteRetrierUtils { } return false; } + + public static boolean causedByExecTimeout(RetryException e) { + return (e.getCause() instanceof ExecutionStatusException + && ((ExecutionStatusException) e.getCause()).isExecutionTimeout()); + } } diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java index 1b7a0809d6..289fe8b4a9 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java @@ -55,8 +55,6 @@ import com.google.protobuf.TextFormat.ParseException; import io.grpc.Context; import io.grpc.Status.Code; import java.io.IOException; -import java.io.OutputStream; -import java.nio.charset.StandardCharsets; import java.time.Duration; import java.util.ArrayList; import java.util.Collection; @@ -239,27 +237,33 @@ class RemoteSpawnRunner implements SpawnRunner { if (Thread.currentThread().isInterrupted()) { throw new InterruptedException(); } - if (options.remoteLocalFallback && !(cause instanceof TimeoutException)) { + if (options.remoteLocalFallback + && !(cause instanceof RetryException + && RemoteRetrierUtils.causedByExecTimeout((RetryException) cause))) { return execLocally(spawn, policy, inputMap, uploadLocalResults, remoteCache, actionKey); } return handleError(cause, policy.getFileOutErr()); } - private SpawnResult handleError(IOException exception, FileOutErr outErr) throws IOException, - ExecException { + private SpawnResult handleError(IOException exception, FileOutErr outErr) + throws ExecException, InterruptedException, IOException { final Throwable cause = exception.getCause(); - if (exception instanceof TimeoutException || cause instanceof TimeoutException) { - // TODO(buchgr): provide stdout/stderr from the action that timed out. - // Remove the unsuported message once remote execution tests no longer check for it. - try (OutputStream out = outErr.getOutputStream()) { - String msg = "Log output for timeouts is not yet supported in remote execution.\n"; - out.write(msg.getBytes(StandardCharsets.UTF_8)); + if (cause instanceof ExecutionStatusException) { + ExecutionStatusException e = (ExecutionStatusException) cause; + if (e.getResponse() != null) { + ExecuteResponse resp = e.getResponse(); + if (resp.hasResult()) { + // We try to download all (partial) results even on server error, for debuggability. + remoteCache.download(resp.getResult(), execRoot, outErr); + } + } + if (e.isExecutionTimeout()) { + return new SpawnResult.Builder() + .setRunnerName(getName()) + .setStatus(Status.TIMEOUT) + .setExitCode(POSIX_TIMEOUT_EXIT_CODE) + .build(); } - return new SpawnResult.Builder() - .setRunnerName(getName()) - .setStatus(Status.TIMEOUT) - .setExitCode(POSIX_TIMEOUT_EXIT_CODE) - .build(); } final Status status; if (exception instanceof RetryException diff --git a/src/main/java/com/google/devtools/build/lib/remote/TimeoutException.java b/src/main/java/com/google/devtools/build/lib/remote/TimeoutException.java deleted file mode 100644 index f1ba33a8dc..0000000000 --- a/src/main/java/com/google/devtools/build/lib/remote/TimeoutException.java +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2017 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -package com.google.devtools.build.lib.remote; - -import java.io.IOException; - -/** - * Exception to signal that a remote execution has timed out. - */ -class TimeoutException extends IOException { -} diff --git a/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java b/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java index 150e759718..98b56706d7 100644 --- a/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java +++ b/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java @@ -63,6 +63,7 @@ import com.google.devtools.common.options.Options; import com.google.devtools.remoteexecution.v1test.ActionResult; import com.google.devtools.remoteexecution.v1test.ExecuteRequest; import com.google.devtools.remoteexecution.v1test.ExecuteResponse; +import com.google.rpc.Code; import java.io.IOException; import java.time.Duration; import java.util.Collection; @@ -536,7 +537,18 @@ public class RemoteSpawnRunnerTest { ActionResult cachedResult = ActionResult.newBuilder().setExitCode(0).build(); when(cache.getCachedActionResult(any(ActionKey.class))).thenReturn(null); - when(executor.executeRemotely(any(ExecuteRequest.class))).thenThrow(new TimeoutException()); + ExecuteResponse resp = + ExecuteResponse.newBuilder() + .setResult(cachedResult) + .setStatus( + com.google.rpc.Status.newBuilder() + .setCode(Code.DEADLINE_EXCEEDED.getNumber()) + .build()) + .build(); + when(executor.executeRemotely(any(ExecuteRequest.class))) + .thenThrow( + new Retrier.RetryException( + "", 1, new ExecutionStatusException(resp.getStatus(), resp))); Spawn spawn = newSimpleSpawn(); @@ -546,7 +558,7 @@ public class RemoteSpawnRunnerTest { assertThat(res.status()).isEqualTo(Status.TIMEOUT); verify(executor).executeRemotely(any(ExecuteRequest.class)); - verify(cache, never()).download(eq(cachedResult), eq(execRoot), any(FileOutErr.class)); + verify(cache).download(eq(cachedResult), eq(execRoot), any(FileOutErr.class)); } @Test @@ -572,7 +584,18 @@ public class RemoteSpawnRunnerTest { ActionResult cachedResult = ActionResult.newBuilder().setExitCode(0).build(); when(cache.getCachedActionResult(any(ActionKey.class))).thenReturn(null); - when(executor.executeRemotely(any(ExecuteRequest.class))).thenThrow(new TimeoutException()); + ExecuteResponse resp = + ExecuteResponse.newBuilder() + .setResult(cachedResult) + .setStatus( + com.google.rpc.Status.newBuilder() + .setCode(Code.DEADLINE_EXCEEDED.getNumber()) + .build()) + .build(); + when(executor.executeRemotely(any(ExecuteRequest.class))) + .thenThrow( + new Retrier.RetryException( + "", 1, new ExecutionStatusException(resp.getStatus(), resp))); Spawn spawn = newSimpleSpawn(); @@ -582,7 +605,7 @@ public class RemoteSpawnRunnerTest { assertThat(res.status()).isEqualTo(Status.TIMEOUT); verify(executor).executeRemotely(any(ExecuteRequest.class)); - verify(cache, never()).download(eq(cachedResult), eq(execRoot), any(FileOutErr.class)); + verify(cache).download(eq(cachedResult), eq(execRoot), any(FileOutErr.class)); verify(localRunner, never()).exec(eq(spawn), eq(policy)); } diff --git a/src/test/shell/bazel/remote_execution_test.sh b/src/test/shell/bazel/remote_execution_test.sh index dff25048a1..2b800b3d54 100755 --- a/src/test/shell/bazel/remote_execution_test.sh +++ b/src/test/shell/bazel/remote_execution_test.sh @@ -418,7 +418,11 @@ EOF cat > a/sleep.sh <<'EOF' #!/bin/sh -sleep 2 +for i in {1..3} +do + echo "Sleeping $i..." + sleep 1 +done EOF chmod +x a/sleep.sh bazel test \ @@ -429,6 +433,10 @@ EOF //a:sleep >& $TEST_log \ && fail "Test failure (timeout) expected" || true expect_log "TIMEOUT" + expect_log "Sleeping 1..." + # The current implementation of the remote worker does not terminate actions + # when they time out, therefore we cannot verify that: + # expect_not_log "Sleeping 3..." } function test_passed_env_user() { diff --git a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java index 6202d3deda..098b4adf4b 100644 --- a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java +++ b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java @@ -25,6 +25,7 @@ import com.google.common.util.concurrent.ListeningExecutorService; import com.google.common.util.concurrent.MoreExecutors; import com.google.common.util.concurrent.ThreadFactoryBuilder; import com.google.devtools.build.lib.remote.CacheNotFoundException; +import com.google.devtools.build.lib.remote.ExecutionStatusException; import com.google.devtools.build.lib.remote.SimpleBlobStoreActionCache; import com.google.devtools.build.lib.remote.util.DigestUtil; import com.google.devtools.build.lib.remote.util.DigestUtil.ActionKey; @@ -40,6 +41,7 @@ import com.google.devtools.remoteexecution.v1test.Action; import com.google.devtools.remoteexecution.v1test.ActionResult; import com.google.devtools.remoteexecution.v1test.Command.EnvironmentVariable; import com.google.devtools.remoteexecution.v1test.ExecuteRequest; +import com.google.devtools.remoteexecution.v1test.ExecuteResponse; import com.google.devtools.remoteexecution.v1test.ExecutionGrpc.ExecutionImplBase; import com.google.devtools.remoteexecution.v1test.Platform; import com.google.devtools.remoteexecution.v1test.RequestMetadata; @@ -49,7 +51,6 @@ import com.google.rpc.Code; import com.google.rpc.Status; import io.grpc.Context; import io.grpc.StatusException; -import io.grpc.protobuf.StatusProto; import io.grpc.stub.StreamObserver; import java.io.ByteArrayOutputStream; import java.io.File; @@ -251,17 +252,19 @@ final class ExecutionServer extends ExecutionImplBase { (cmdResult != null && cmdResult.getTerminationStatus().timedOut()) || wasTimeout(timeoutMillis, System.currentTimeMillis() - startTime); final int exitCode; + ExecuteResponse.Builder resp = ExecuteResponse.newBuilder(); if (wasTimeout) { final String errMessage = String.format( "Command:\n%s\nexceeded deadline of %f seconds.", Arrays.toString(command.getArgumentsList().toArray()), timeoutMillis / 1000.0); logger.warning(errMessage); - throw StatusProto.toStatusException( + resp.setStatus( Status.newBuilder() .setCode(Code.DEADLINE_EXCEEDED.getNumber()) .setMessage(errMessage) .build()); + exitCode = LOCAL_EXEC_ERROR; } else if (cmdResult == null) { exitCode = LOCAL_EXEC_ERROR; } else { @@ -278,6 +281,10 @@ final class ExecutionServer extends ExecutionImplBase { ActionKey actionKey = digestUtil.computeActionKey(action); cache.setCachedActionResult(actionKey, finalResult); } + resp.setResult(finalResult); + if (wasTimeout) { + throw new ExecutionStatusException(resp.getStatus(), resp.build()); + } return finalResult; } diff --git a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java index 26797dc30c..7d3db46f56 100644 --- a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java +++ b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java @@ -17,6 +17,7 @@ package com.google.devtools.build.remote.worker; import com.google.common.base.Throwables; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.MoreExecutors; +import com.google.devtools.build.lib.remote.ExecutionStatusException; import com.google.devtools.remoteexecution.v1test.ActionResult; import com.google.devtools.remoteexecution.v1test.ExecuteResponse; import com.google.longrunning.Operation; @@ -58,45 +59,57 @@ final class WatcherServer extends WatcherImplBase { return; } - future.addListener(() -> { - try { - try { - ActionResult result = future.get(); - responseObserver.onNext( - packExists( - Operation.newBuilder() - .setName(opName) - .setDone(true) - .setResponse( - Any.pack(ExecuteResponse.newBuilder().setResult(result).build())))); - responseObserver.onCompleted(); - } catch (ExecutionException e) { - Throwables.throwIfUnchecked(e.getCause()); - throw (Exception) e.getCause(); - } - } catch (Exception e) { - logger.log(Level.SEVERE, "Work failed: " + opName, e); - responseObserver.onNext( - ChangeBatch.newBuilder() - .addChanges( - Change.newBuilder() - .setState(Change.State.EXISTS) - .setData( - Any.pack( - Operation.newBuilder() - .setName(opName) - .setError(StatusUtils.internalErrorStatus(e)) - .build())) - .build()) - .build()); - responseObserver.onCompleted(); - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - } finally { - operationsCache.remove(opName); - } - }, MoreExecutors.directExecutor()); + future.addListener( + () -> { + try { + try { + ActionResult result = future.get(); + responseObserver.onNext( + packExists( + Operation.newBuilder() + .setName(opName) + .setDone(true) + .setResponse( + Any.pack(ExecuteResponse.newBuilder().setResult(result).build())))); + responseObserver.onCompleted(); + } catch (ExecutionException e) { + Throwables.throwIfUnchecked(e.getCause()); + throw (Exception) e.getCause(); + } + } catch (Exception e) { + ExecuteResponse resp; + if (e instanceof ExecutionStatusException) { + resp = ((ExecutionStatusException) e).getResponse(); + } else { + logger.log(Level.SEVERE, "Work failed: " + opName, e); + resp = + ExecuteResponse.newBuilder() + .setStatus(StatusUtils.internalErrorStatus(e)) + .build(); + } + responseObserver.onNext( + ChangeBatch.newBuilder() + .addChanges( + Change.newBuilder() + .setState(Change.State.EXISTS) + .setData( + Any.pack( + Operation.newBuilder() + .setName(opName) + .setDone(true) + .setResponse(Any.pack(resp)) + .build())) + .build()) + .build()); + responseObserver.onCompleted(); + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + } finally { + operationsCache.remove(opName); + } + }, + MoreExecutors.directExecutor()); } /** Constructs a ChangeBatch with an exists state change that contains the given operation. */ -- cgit v1.2.3