diff options
Diffstat (limited to 'src')
9 files changed, 188 insertions, 95 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/remote/ExecutionStatusException.java b/src/main/java/com/google/devtools/build/lib/remote/ExecutionStatusException.java new file mode 100644 index 0000000000..63839a7ae8 --- /dev/null +++ b/src/main/java/com/google/devtools/build/lib/remote/ExecutionStatusException.java @@ -0,0 +1,61 @@ +// Copyright 2017 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package com.google.devtools.build.lib.remote; + +import com.google.devtools.remoteexecution.v1test.ExecuteResponse; +import com.google.rpc.Status; +import io.grpc.Status.Code; +import io.grpc.StatusRuntimeException; +import javax.annotation.Nullable; + +/** + * Exception to signal that a remote execution has failed with a certain status received from the + * server, and other details, such as the action result and the server logs. The exception may be + * retriable or not, depending on the status/details. + */ +public class ExecutionStatusException extends StatusRuntimeException { + private final Status status; + private final ExecuteResponse response; + + public ExecutionStatusException(Status status, @Nullable ExecuteResponse response) { + super(convertStatus(status, response)); + this.status = status; + this.response = response; + } + + private static io.grpc.Status convertStatus(Status status, @Nullable ExecuteResponse response) { + io.grpc.Status result = + io.grpc.Status.fromCodeValue( + // Hack: convert to non-retriable exception on timeouts. + isExecutionTimeout(status, response) + ? Code.FAILED_PRECONDITION.value() + : status.getCode()); + return result.withDescription(status.getMessage()); + } + + private static boolean isExecutionTimeout(Status status, @Nullable ExecuteResponse response) { + return response != null + && response.getStatus().equals(status) + && status.getCode() == Code.DEADLINE_EXCEEDED.value(); + } + + public boolean isExecutionTimeout() { + return isExecutionTimeout(status, response); + } + + @Nullable + public ExecuteResponse getResponse() { + return response; + } +} diff --git a/src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java b/src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java index 3b91174a06..fb8cf0d9c4 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java +++ b/src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java @@ -73,29 +73,23 @@ class GrpcRemoteExecutor { .withCallCredentials(callCredentials); } - private void handleStatus(Status statusProto) throws IOException { - StatusRuntimeException e = StatusProto.toStatusRuntimeException(statusProto); - if (e.getStatus().getCode() == Code.OK) { + private void handleStatus(Status statusProto, @Nullable ExecuteResponse resp) throws IOException { + if (statusProto.getCode() == Code.OK.value()) { return; } - if (e.getStatus().getCode() == Code.DEADLINE_EXCEEDED) { - // This was caused by the command itself exceeding the timeout, - // therefore it is not retriable. - throw new TimeoutException(); - } - throw e; + throw new ExecutionStatusException(statusProto, resp); } private @Nullable ExecuteResponse getOperationResponse(Operation op) throws IOException { if (op.getResultCase() == Operation.ResultCase.ERROR) { - handleStatus(op.getError()); + handleStatus(op.getError(), null); } if (op.getDone()) { Preconditions.checkState(op.getResultCase() != Operation.ResultCase.RESULT_NOT_SET); try { ExecuteResponse resp = op.getResponse().unpack(ExecuteResponse.class); if (resp.hasStatus()) { - handleStatus(resp.getStatus()); + handleStatus(resp.getStatus(), resp); } return resp; } catch (InvalidProtocolBufferException e) { diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java index f518b43682..70c73fbcdf 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java @@ -30,4 +30,9 @@ public final class RemoteRetrierUtils { } return false; } + + public static boolean causedByExecTimeout(RetryException e) { + return (e.getCause() instanceof ExecutionStatusException + && ((ExecutionStatusException) e.getCause()).isExecutionTimeout()); + } } diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java index 1b7a0809d6..289fe8b4a9 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java @@ -55,8 +55,6 @@ import com.google.protobuf.TextFormat.ParseException; import io.grpc.Context; import io.grpc.Status.Code; import java.io.IOException; -import java.io.OutputStream; -import java.nio.charset.StandardCharsets; import java.time.Duration; import java.util.ArrayList; import java.util.Collection; @@ -239,27 +237,33 @@ class RemoteSpawnRunner implements SpawnRunner { if (Thread.currentThread().isInterrupted()) { throw new InterruptedException(); } - if (options.remoteLocalFallback && !(cause instanceof TimeoutException)) { + if (options.remoteLocalFallback + && !(cause instanceof RetryException + && RemoteRetrierUtils.causedByExecTimeout((RetryException) cause))) { return execLocally(spawn, policy, inputMap, uploadLocalResults, remoteCache, actionKey); } return handleError(cause, policy.getFileOutErr()); } - private SpawnResult handleError(IOException exception, FileOutErr outErr) throws IOException, - ExecException { + private SpawnResult handleError(IOException exception, FileOutErr outErr) + throws ExecException, InterruptedException, IOException { final Throwable cause = exception.getCause(); - if (exception instanceof TimeoutException || cause instanceof TimeoutException) { - // TODO(buchgr): provide stdout/stderr from the action that timed out. - // Remove the unsuported message once remote execution tests no longer check for it. - try (OutputStream out = outErr.getOutputStream()) { - String msg = "Log output for timeouts is not yet supported in remote execution.\n"; - out.write(msg.getBytes(StandardCharsets.UTF_8)); + if (cause instanceof ExecutionStatusException) { + ExecutionStatusException e = (ExecutionStatusException) cause; + if (e.getResponse() != null) { + ExecuteResponse resp = e.getResponse(); + if (resp.hasResult()) { + // We try to download all (partial) results even on server error, for debuggability. + remoteCache.download(resp.getResult(), execRoot, outErr); + } + } + if (e.isExecutionTimeout()) { + return new SpawnResult.Builder() + .setRunnerName(getName()) + .setStatus(Status.TIMEOUT) + .setExitCode(POSIX_TIMEOUT_EXIT_CODE) + .build(); } - return new SpawnResult.Builder() - .setRunnerName(getName()) - .setStatus(Status.TIMEOUT) - .setExitCode(POSIX_TIMEOUT_EXIT_CODE) - .build(); } final Status status; if (exception instanceof RetryException diff --git a/src/main/java/com/google/devtools/build/lib/remote/TimeoutException.java b/src/main/java/com/google/devtools/build/lib/remote/TimeoutException.java deleted file mode 100644 index f1ba33a8dc..0000000000 --- a/src/main/java/com/google/devtools/build/lib/remote/TimeoutException.java +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2017 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -package com.google.devtools.build.lib.remote; - -import java.io.IOException; - -/** - * Exception to signal that a remote execution has timed out. - */ -class TimeoutException extends IOException { -} diff --git a/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java b/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java index 150e759718..98b56706d7 100644 --- a/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java +++ b/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java @@ -63,6 +63,7 @@ import com.google.devtools.common.options.Options; import com.google.devtools.remoteexecution.v1test.ActionResult; import com.google.devtools.remoteexecution.v1test.ExecuteRequest; import com.google.devtools.remoteexecution.v1test.ExecuteResponse; +import com.google.rpc.Code; import java.io.IOException; import java.time.Duration; import java.util.Collection; @@ -536,7 +537,18 @@ public class RemoteSpawnRunnerTest { ActionResult cachedResult = ActionResult.newBuilder().setExitCode(0).build(); when(cache.getCachedActionResult(any(ActionKey.class))).thenReturn(null); - when(executor.executeRemotely(any(ExecuteRequest.class))).thenThrow(new TimeoutException()); + ExecuteResponse resp = + ExecuteResponse.newBuilder() + .setResult(cachedResult) + .setStatus( + com.google.rpc.Status.newBuilder() + .setCode(Code.DEADLINE_EXCEEDED.getNumber()) + .build()) + .build(); + when(executor.executeRemotely(any(ExecuteRequest.class))) + .thenThrow( + new Retrier.RetryException( + "", 1, new ExecutionStatusException(resp.getStatus(), resp))); Spawn spawn = newSimpleSpawn(); @@ -546,7 +558,7 @@ public class RemoteSpawnRunnerTest { assertThat(res.status()).isEqualTo(Status.TIMEOUT); verify(executor).executeRemotely(any(ExecuteRequest.class)); - verify(cache, never()).download(eq(cachedResult), eq(execRoot), any(FileOutErr.class)); + verify(cache).download(eq(cachedResult), eq(execRoot), any(FileOutErr.class)); } @Test @@ -572,7 +584,18 @@ public class RemoteSpawnRunnerTest { ActionResult cachedResult = ActionResult.newBuilder().setExitCode(0).build(); when(cache.getCachedActionResult(any(ActionKey.class))).thenReturn(null); - when(executor.executeRemotely(any(ExecuteRequest.class))).thenThrow(new TimeoutException()); + ExecuteResponse resp = + ExecuteResponse.newBuilder() + .setResult(cachedResult) + .setStatus( + com.google.rpc.Status.newBuilder() + .setCode(Code.DEADLINE_EXCEEDED.getNumber()) + .build()) + .build(); + when(executor.executeRemotely(any(ExecuteRequest.class))) + .thenThrow( + new Retrier.RetryException( + "", 1, new ExecutionStatusException(resp.getStatus(), resp))); Spawn spawn = newSimpleSpawn(); @@ -582,7 +605,7 @@ public class RemoteSpawnRunnerTest { assertThat(res.status()).isEqualTo(Status.TIMEOUT); verify(executor).executeRemotely(any(ExecuteRequest.class)); - verify(cache, never()).download(eq(cachedResult), eq(execRoot), any(FileOutErr.class)); + verify(cache).download(eq(cachedResult), eq(execRoot), any(FileOutErr.class)); verify(localRunner, never()).exec(eq(spawn), eq(policy)); } diff --git a/src/test/shell/bazel/remote_execution_test.sh b/src/test/shell/bazel/remote_execution_test.sh index dff25048a1..2b800b3d54 100755 --- a/src/test/shell/bazel/remote_execution_test.sh +++ b/src/test/shell/bazel/remote_execution_test.sh @@ -418,7 +418,11 @@ EOF cat > a/sleep.sh <<'EOF' #!/bin/sh -sleep 2 +for i in {1..3} +do + echo "Sleeping $i..." + sleep 1 +done EOF chmod +x a/sleep.sh bazel test \ @@ -429,6 +433,10 @@ EOF //a:sleep >& $TEST_log \ && fail "Test failure (timeout) expected" || true expect_log "TIMEOUT" + expect_log "Sleeping 1..." + # The current implementation of the remote worker does not terminate actions + # when they time out, therefore we cannot verify that: + # expect_not_log "Sleeping 3..." } function test_passed_env_user() { diff --git a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java index 6202d3deda..098b4adf4b 100644 --- a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java +++ b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java @@ -25,6 +25,7 @@ import com.google.common.util.concurrent.ListeningExecutorService; import com.google.common.util.concurrent.MoreExecutors; import com.google.common.util.concurrent.ThreadFactoryBuilder; import com.google.devtools.build.lib.remote.CacheNotFoundException; +import com.google.devtools.build.lib.remote.ExecutionStatusException; import com.google.devtools.build.lib.remote.SimpleBlobStoreActionCache; import com.google.devtools.build.lib.remote.util.DigestUtil; import com.google.devtools.build.lib.remote.util.DigestUtil.ActionKey; @@ -40,6 +41,7 @@ import com.google.devtools.remoteexecution.v1test.Action; import com.google.devtools.remoteexecution.v1test.ActionResult; import com.google.devtools.remoteexecution.v1test.Command.EnvironmentVariable; import com.google.devtools.remoteexecution.v1test.ExecuteRequest; +import com.google.devtools.remoteexecution.v1test.ExecuteResponse; import com.google.devtools.remoteexecution.v1test.ExecutionGrpc.ExecutionImplBase; import com.google.devtools.remoteexecution.v1test.Platform; import com.google.devtools.remoteexecution.v1test.RequestMetadata; @@ -49,7 +51,6 @@ import com.google.rpc.Code; import com.google.rpc.Status; import io.grpc.Context; import io.grpc.StatusException; -import io.grpc.protobuf.StatusProto; import io.grpc.stub.StreamObserver; import java.io.ByteArrayOutputStream; import java.io.File; @@ -251,17 +252,19 @@ final class ExecutionServer extends ExecutionImplBase { (cmdResult != null && cmdResult.getTerminationStatus().timedOut()) || wasTimeout(timeoutMillis, System.currentTimeMillis() - startTime); final int exitCode; + ExecuteResponse.Builder resp = ExecuteResponse.newBuilder(); if (wasTimeout) { final String errMessage = String.format( "Command:\n%s\nexceeded deadline of %f seconds.", Arrays.toString(command.getArgumentsList().toArray()), timeoutMillis / 1000.0); logger.warning(errMessage); - throw StatusProto.toStatusException( + resp.setStatus( Status.newBuilder() .setCode(Code.DEADLINE_EXCEEDED.getNumber()) .setMessage(errMessage) .build()); + exitCode = LOCAL_EXEC_ERROR; } else if (cmdResult == null) { exitCode = LOCAL_EXEC_ERROR; } else { @@ -278,6 +281,10 @@ final class ExecutionServer extends ExecutionImplBase { ActionKey actionKey = digestUtil.computeActionKey(action); cache.setCachedActionResult(actionKey, finalResult); } + resp.setResult(finalResult); + if (wasTimeout) { + throw new ExecutionStatusException(resp.getStatus(), resp.build()); + } return finalResult; } diff --git a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java index 26797dc30c..7d3db46f56 100644 --- a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java +++ b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java @@ -17,6 +17,7 @@ package com.google.devtools.build.remote.worker; import com.google.common.base.Throwables; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.MoreExecutors; +import com.google.devtools.build.lib.remote.ExecutionStatusException; import com.google.devtools.remoteexecution.v1test.ActionResult; import com.google.devtools.remoteexecution.v1test.ExecuteResponse; import com.google.longrunning.Operation; @@ -58,45 +59,57 @@ final class WatcherServer extends WatcherImplBase { return; } - future.addListener(() -> { - try { - try { - ActionResult result = future.get(); - responseObserver.onNext( - packExists( - Operation.newBuilder() - .setName(opName) - .setDone(true) - .setResponse( - Any.pack(ExecuteResponse.newBuilder().setResult(result).build())))); - responseObserver.onCompleted(); - } catch (ExecutionException e) { - Throwables.throwIfUnchecked(e.getCause()); - throw (Exception) e.getCause(); - } - } catch (Exception e) { - logger.log(Level.SEVERE, "Work failed: " + opName, e); - responseObserver.onNext( - ChangeBatch.newBuilder() - .addChanges( - Change.newBuilder() - .setState(Change.State.EXISTS) - .setData( - Any.pack( - Operation.newBuilder() - .setName(opName) - .setError(StatusUtils.internalErrorStatus(e)) - .build())) - .build()) - .build()); - responseObserver.onCompleted(); - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - } finally { - operationsCache.remove(opName); - } - }, MoreExecutors.directExecutor()); + future.addListener( + () -> { + try { + try { + ActionResult result = future.get(); + responseObserver.onNext( + packExists( + Operation.newBuilder() + .setName(opName) + .setDone(true) + .setResponse( + Any.pack(ExecuteResponse.newBuilder().setResult(result).build())))); + responseObserver.onCompleted(); + } catch (ExecutionException e) { + Throwables.throwIfUnchecked(e.getCause()); + throw (Exception) e.getCause(); + } + } catch (Exception e) { + ExecuteResponse resp; + if (e instanceof ExecutionStatusException) { + resp = ((ExecutionStatusException) e).getResponse(); + } else { + logger.log(Level.SEVERE, "Work failed: " + opName, e); + resp = + ExecuteResponse.newBuilder() + .setStatus(StatusUtils.internalErrorStatus(e)) + .build(); + } + responseObserver.onNext( + ChangeBatch.newBuilder() + .addChanges( + Change.newBuilder() + .setState(Change.State.EXISTS) + .setData( + Any.pack( + Operation.newBuilder() + .setName(opName) + .setDone(true) + .setResponse(Any.pack(resp)) + .build())) + .build()) + .build()); + responseObserver.onCompleted(); + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + } finally { + operationsCache.remove(opName); + } + }, + MoreExecutors.directExecutor()); } /** Constructs a ChangeBatch with an exists state change that contains the given operation. */ |