aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorGravatar olaola <olaola@google.com>2018-03-16 08:48:13 -0700
committerGravatar Copybara-Service <copybara-piper@google.com>2018-03-16 08:50:15 -0700
commit2732df0202499aff75de38c012fec57ed53a5a30 (patch)
treecf5d5589c7da831f16af5b60d80c311e77814af4 /src
parent435fde772d284cdb5b73a88c3c02f7aa94a01ec1 (diff)
Propagating remote results, including stdout/err, to Bazel on execution timeouts.
The refactoring to have an Exception that contains partial results will also be used in the next CL, in order to propagate and save remote server logs. RELNOTES: None PiperOrigin-RevId: 189344465
Diffstat (limited to 'src')
-rw-r--r--src/main/java/com/google/devtools/build/lib/remote/ExecutionStatusException.java61
-rw-r--r--src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java16
-rw-r--r--src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java5
-rw-r--r--src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java36
-rw-r--r--src/main/java/com/google/devtools/build/lib/remote/TimeoutException.java22
-rw-r--r--src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java31
-rwxr-xr-xsrc/test/shell/bazel/remote_execution_test.sh10
-rw-r--r--src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java11
-rw-r--r--src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java91
9 files changed, 188 insertions, 95 deletions
diff --git a/src/main/java/com/google/devtools/build/lib/remote/ExecutionStatusException.java b/src/main/java/com/google/devtools/build/lib/remote/ExecutionStatusException.java
new file mode 100644
index 0000000000..63839a7ae8
--- /dev/null
+++ b/src/main/java/com/google/devtools/build/lib/remote/ExecutionStatusException.java
@@ -0,0 +1,61 @@
+// Copyright 2017 The Bazel Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package com.google.devtools.build.lib.remote;
+
+import com.google.devtools.remoteexecution.v1test.ExecuteResponse;
+import com.google.rpc.Status;
+import io.grpc.Status.Code;
+import io.grpc.StatusRuntimeException;
+import javax.annotation.Nullable;
+
+/**
+ * Exception to signal that a remote execution has failed with a certain status received from the
+ * server, and other details, such as the action result and the server logs. The exception may be
+ * retriable or not, depending on the status/details.
+ */
+public class ExecutionStatusException extends StatusRuntimeException {
+ private final Status status;
+ private final ExecuteResponse response;
+
+ public ExecutionStatusException(Status status, @Nullable ExecuteResponse response) {
+ super(convertStatus(status, response));
+ this.status = status;
+ this.response = response;
+ }
+
+ private static io.grpc.Status convertStatus(Status status, @Nullable ExecuteResponse response) {
+ io.grpc.Status result =
+ io.grpc.Status.fromCodeValue(
+ // Hack: convert to non-retriable exception on timeouts.
+ isExecutionTimeout(status, response)
+ ? Code.FAILED_PRECONDITION.value()
+ : status.getCode());
+ return result.withDescription(status.getMessage());
+ }
+
+ private static boolean isExecutionTimeout(Status status, @Nullable ExecuteResponse response) {
+ return response != null
+ && response.getStatus().equals(status)
+ && status.getCode() == Code.DEADLINE_EXCEEDED.value();
+ }
+
+ public boolean isExecutionTimeout() {
+ return isExecutionTimeout(status, response);
+ }
+
+ @Nullable
+ public ExecuteResponse getResponse() {
+ return response;
+ }
+}
diff --git a/src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java b/src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java
index 3b91174a06..fb8cf0d9c4 100644
--- a/src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java
+++ b/src/main/java/com/google/devtools/build/lib/remote/GrpcRemoteExecutor.java
@@ -73,29 +73,23 @@ class GrpcRemoteExecutor {
.withCallCredentials(callCredentials);
}
- private void handleStatus(Status statusProto) throws IOException {
- StatusRuntimeException e = StatusProto.toStatusRuntimeException(statusProto);
- if (e.getStatus().getCode() == Code.OK) {
+ private void handleStatus(Status statusProto, @Nullable ExecuteResponse resp) throws IOException {
+ if (statusProto.getCode() == Code.OK.value()) {
return;
}
- if (e.getStatus().getCode() == Code.DEADLINE_EXCEEDED) {
- // This was caused by the command itself exceeding the timeout,
- // therefore it is not retriable.
- throw new TimeoutException();
- }
- throw e;
+ throw new ExecutionStatusException(statusProto, resp);
}
private @Nullable ExecuteResponse getOperationResponse(Operation op) throws IOException {
if (op.getResultCase() == Operation.ResultCase.ERROR) {
- handleStatus(op.getError());
+ handleStatus(op.getError(), null);
}
if (op.getDone()) {
Preconditions.checkState(op.getResultCase() != Operation.ResultCase.RESULT_NOT_SET);
try {
ExecuteResponse resp = op.getResponse().unpack(ExecuteResponse.class);
if (resp.hasStatus()) {
- handleStatus(resp.getStatus());
+ handleStatus(resp.getStatus(), resp);
}
return resp;
} catch (InvalidProtocolBufferException e) {
diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java
index f518b43682..70c73fbcdf 100644
--- a/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java
+++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrierUtils.java
@@ -30,4 +30,9 @@ public final class RemoteRetrierUtils {
}
return false;
}
+
+ public static boolean causedByExecTimeout(RetryException e) {
+ return (e.getCause() instanceof ExecutionStatusException
+ && ((ExecutionStatusException) e.getCause()).isExecutionTimeout());
+ }
}
diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java
index 1b7a0809d6..289fe8b4a9 100644
--- a/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java
+++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java
@@ -55,8 +55,6 @@ import com.google.protobuf.TextFormat.ParseException;
import io.grpc.Context;
import io.grpc.Status.Code;
import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;
@@ -239,27 +237,33 @@ class RemoteSpawnRunner implements SpawnRunner {
if (Thread.currentThread().isInterrupted()) {
throw new InterruptedException();
}
- if (options.remoteLocalFallback && !(cause instanceof TimeoutException)) {
+ if (options.remoteLocalFallback
+ && !(cause instanceof RetryException
+ && RemoteRetrierUtils.causedByExecTimeout((RetryException) cause))) {
return execLocally(spawn, policy, inputMap, uploadLocalResults, remoteCache, actionKey);
}
return handleError(cause, policy.getFileOutErr());
}
- private SpawnResult handleError(IOException exception, FileOutErr outErr) throws IOException,
- ExecException {
+ private SpawnResult handleError(IOException exception, FileOutErr outErr)
+ throws ExecException, InterruptedException, IOException {
final Throwable cause = exception.getCause();
- if (exception instanceof TimeoutException || cause instanceof TimeoutException) {
- // TODO(buchgr): provide stdout/stderr from the action that timed out.
- // Remove the unsuported message once remote execution tests no longer check for it.
- try (OutputStream out = outErr.getOutputStream()) {
- String msg = "Log output for timeouts is not yet supported in remote execution.\n";
- out.write(msg.getBytes(StandardCharsets.UTF_8));
+ if (cause instanceof ExecutionStatusException) {
+ ExecutionStatusException e = (ExecutionStatusException) cause;
+ if (e.getResponse() != null) {
+ ExecuteResponse resp = e.getResponse();
+ if (resp.hasResult()) {
+ // We try to download all (partial) results even on server error, for debuggability.
+ remoteCache.download(resp.getResult(), execRoot, outErr);
+ }
+ }
+ if (e.isExecutionTimeout()) {
+ return new SpawnResult.Builder()
+ .setRunnerName(getName())
+ .setStatus(Status.TIMEOUT)
+ .setExitCode(POSIX_TIMEOUT_EXIT_CODE)
+ .build();
}
- return new SpawnResult.Builder()
- .setRunnerName(getName())
- .setStatus(Status.TIMEOUT)
- .setExitCode(POSIX_TIMEOUT_EXIT_CODE)
- .build();
}
final Status status;
if (exception instanceof RetryException
diff --git a/src/main/java/com/google/devtools/build/lib/remote/TimeoutException.java b/src/main/java/com/google/devtools/build/lib/remote/TimeoutException.java
deleted file mode 100644
index f1ba33a8dc..0000000000
--- a/src/main/java/com/google/devtools/build/lib/remote/TimeoutException.java
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright 2017 The Bazel Authors. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-package com.google.devtools.build.lib.remote;
-
-import java.io.IOException;
-
-/**
- * Exception to signal that a remote execution has timed out.
- */
-class TimeoutException extends IOException {
-}
diff --git a/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java b/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java
index 150e759718..98b56706d7 100644
--- a/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java
+++ b/src/test/java/com/google/devtools/build/lib/remote/RemoteSpawnRunnerTest.java
@@ -63,6 +63,7 @@ import com.google.devtools.common.options.Options;
import com.google.devtools.remoteexecution.v1test.ActionResult;
import com.google.devtools.remoteexecution.v1test.ExecuteRequest;
import com.google.devtools.remoteexecution.v1test.ExecuteResponse;
+import com.google.rpc.Code;
import java.io.IOException;
import java.time.Duration;
import java.util.Collection;
@@ -536,7 +537,18 @@ public class RemoteSpawnRunnerTest {
ActionResult cachedResult = ActionResult.newBuilder().setExitCode(0).build();
when(cache.getCachedActionResult(any(ActionKey.class))).thenReturn(null);
- when(executor.executeRemotely(any(ExecuteRequest.class))).thenThrow(new TimeoutException());
+ ExecuteResponse resp =
+ ExecuteResponse.newBuilder()
+ .setResult(cachedResult)
+ .setStatus(
+ com.google.rpc.Status.newBuilder()
+ .setCode(Code.DEADLINE_EXCEEDED.getNumber())
+ .build())
+ .build();
+ when(executor.executeRemotely(any(ExecuteRequest.class)))
+ .thenThrow(
+ new Retrier.RetryException(
+ "", 1, new ExecutionStatusException(resp.getStatus(), resp)));
Spawn spawn = newSimpleSpawn();
@@ -546,7 +558,7 @@ public class RemoteSpawnRunnerTest {
assertThat(res.status()).isEqualTo(Status.TIMEOUT);
verify(executor).executeRemotely(any(ExecuteRequest.class));
- verify(cache, never()).download(eq(cachedResult), eq(execRoot), any(FileOutErr.class));
+ verify(cache).download(eq(cachedResult), eq(execRoot), any(FileOutErr.class));
}
@Test
@@ -572,7 +584,18 @@ public class RemoteSpawnRunnerTest {
ActionResult cachedResult = ActionResult.newBuilder().setExitCode(0).build();
when(cache.getCachedActionResult(any(ActionKey.class))).thenReturn(null);
- when(executor.executeRemotely(any(ExecuteRequest.class))).thenThrow(new TimeoutException());
+ ExecuteResponse resp =
+ ExecuteResponse.newBuilder()
+ .setResult(cachedResult)
+ .setStatus(
+ com.google.rpc.Status.newBuilder()
+ .setCode(Code.DEADLINE_EXCEEDED.getNumber())
+ .build())
+ .build();
+ when(executor.executeRemotely(any(ExecuteRequest.class)))
+ .thenThrow(
+ new Retrier.RetryException(
+ "", 1, new ExecutionStatusException(resp.getStatus(), resp)));
Spawn spawn = newSimpleSpawn();
@@ -582,7 +605,7 @@ public class RemoteSpawnRunnerTest {
assertThat(res.status()).isEqualTo(Status.TIMEOUT);
verify(executor).executeRemotely(any(ExecuteRequest.class));
- verify(cache, never()).download(eq(cachedResult), eq(execRoot), any(FileOutErr.class));
+ verify(cache).download(eq(cachedResult), eq(execRoot), any(FileOutErr.class));
verify(localRunner, never()).exec(eq(spawn), eq(policy));
}
diff --git a/src/test/shell/bazel/remote_execution_test.sh b/src/test/shell/bazel/remote_execution_test.sh
index dff25048a1..2b800b3d54 100755
--- a/src/test/shell/bazel/remote_execution_test.sh
+++ b/src/test/shell/bazel/remote_execution_test.sh
@@ -418,7 +418,11 @@ EOF
cat > a/sleep.sh <<'EOF'
#!/bin/sh
-sleep 2
+for i in {1..3}
+do
+ echo "Sleeping $i..."
+ sleep 1
+done
EOF
chmod +x a/sleep.sh
bazel test \
@@ -429,6 +433,10 @@ EOF
//a:sleep >& $TEST_log \
&& fail "Test failure (timeout) expected" || true
expect_log "TIMEOUT"
+ expect_log "Sleeping 1..."
+ # The current implementation of the remote worker does not terminate actions
+ # when they time out, therefore we cannot verify that:
+ # expect_not_log "Sleeping 3..."
}
function test_passed_env_user() {
diff --git a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java
index 6202d3deda..098b4adf4b 100644
--- a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java
+++ b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/ExecutionServer.java
@@ -25,6 +25,7 @@ import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.devtools.build.lib.remote.CacheNotFoundException;
+import com.google.devtools.build.lib.remote.ExecutionStatusException;
import com.google.devtools.build.lib.remote.SimpleBlobStoreActionCache;
import com.google.devtools.build.lib.remote.util.DigestUtil;
import com.google.devtools.build.lib.remote.util.DigestUtil.ActionKey;
@@ -40,6 +41,7 @@ import com.google.devtools.remoteexecution.v1test.Action;
import com.google.devtools.remoteexecution.v1test.ActionResult;
import com.google.devtools.remoteexecution.v1test.Command.EnvironmentVariable;
import com.google.devtools.remoteexecution.v1test.ExecuteRequest;
+import com.google.devtools.remoteexecution.v1test.ExecuteResponse;
import com.google.devtools.remoteexecution.v1test.ExecutionGrpc.ExecutionImplBase;
import com.google.devtools.remoteexecution.v1test.Platform;
import com.google.devtools.remoteexecution.v1test.RequestMetadata;
@@ -49,7 +51,6 @@ import com.google.rpc.Code;
import com.google.rpc.Status;
import io.grpc.Context;
import io.grpc.StatusException;
-import io.grpc.protobuf.StatusProto;
import io.grpc.stub.StreamObserver;
import java.io.ByteArrayOutputStream;
import java.io.File;
@@ -251,17 +252,19 @@ final class ExecutionServer extends ExecutionImplBase {
(cmdResult != null && cmdResult.getTerminationStatus().timedOut())
|| wasTimeout(timeoutMillis, System.currentTimeMillis() - startTime);
final int exitCode;
+ ExecuteResponse.Builder resp = ExecuteResponse.newBuilder();
if (wasTimeout) {
final String errMessage =
String.format(
"Command:\n%s\nexceeded deadline of %f seconds.",
Arrays.toString(command.getArgumentsList().toArray()), timeoutMillis / 1000.0);
logger.warning(errMessage);
- throw StatusProto.toStatusException(
+ resp.setStatus(
Status.newBuilder()
.setCode(Code.DEADLINE_EXCEEDED.getNumber())
.setMessage(errMessage)
.build());
+ exitCode = LOCAL_EXEC_ERROR;
} else if (cmdResult == null) {
exitCode = LOCAL_EXEC_ERROR;
} else {
@@ -278,6 +281,10 @@ final class ExecutionServer extends ExecutionImplBase {
ActionKey actionKey = digestUtil.computeActionKey(action);
cache.setCachedActionResult(actionKey, finalResult);
}
+ resp.setResult(finalResult);
+ if (wasTimeout) {
+ throw new ExecutionStatusException(resp.getStatus(), resp.build());
+ }
return finalResult;
}
diff --git a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java
index 26797dc30c..7d3db46f56 100644
--- a/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java
+++ b/src/tools/remote/src/main/java/com/google/devtools/build/remote/worker/WatcherServer.java
@@ -17,6 +17,7 @@ package com.google.devtools.build.remote.worker;
import com.google.common.base.Throwables;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.MoreExecutors;
+import com.google.devtools.build.lib.remote.ExecutionStatusException;
import com.google.devtools.remoteexecution.v1test.ActionResult;
import com.google.devtools.remoteexecution.v1test.ExecuteResponse;
import com.google.longrunning.Operation;
@@ -58,45 +59,57 @@ final class WatcherServer extends WatcherImplBase {
return;
}
- future.addListener(() -> {
- try {
- try {
- ActionResult result = future.get();
- responseObserver.onNext(
- packExists(
- Operation.newBuilder()
- .setName(opName)
- .setDone(true)
- .setResponse(
- Any.pack(ExecuteResponse.newBuilder().setResult(result).build()))));
- responseObserver.onCompleted();
- } catch (ExecutionException e) {
- Throwables.throwIfUnchecked(e.getCause());
- throw (Exception) e.getCause();
- }
- } catch (Exception e) {
- logger.log(Level.SEVERE, "Work failed: " + opName, e);
- responseObserver.onNext(
- ChangeBatch.newBuilder()
- .addChanges(
- Change.newBuilder()
- .setState(Change.State.EXISTS)
- .setData(
- Any.pack(
- Operation.newBuilder()
- .setName(opName)
- .setError(StatusUtils.internalErrorStatus(e))
- .build()))
- .build())
- .build());
- responseObserver.onCompleted();
- if (e instanceof InterruptedException) {
- Thread.currentThread().interrupt();
- }
- } finally {
- operationsCache.remove(opName);
- }
- }, MoreExecutors.directExecutor());
+ future.addListener(
+ () -> {
+ try {
+ try {
+ ActionResult result = future.get();
+ responseObserver.onNext(
+ packExists(
+ Operation.newBuilder()
+ .setName(opName)
+ .setDone(true)
+ .setResponse(
+ Any.pack(ExecuteResponse.newBuilder().setResult(result).build()))));
+ responseObserver.onCompleted();
+ } catch (ExecutionException e) {
+ Throwables.throwIfUnchecked(e.getCause());
+ throw (Exception) e.getCause();
+ }
+ } catch (Exception e) {
+ ExecuteResponse resp;
+ if (e instanceof ExecutionStatusException) {
+ resp = ((ExecutionStatusException) e).getResponse();
+ } else {
+ logger.log(Level.SEVERE, "Work failed: " + opName, e);
+ resp =
+ ExecuteResponse.newBuilder()
+ .setStatus(StatusUtils.internalErrorStatus(e))
+ .build();
+ }
+ responseObserver.onNext(
+ ChangeBatch.newBuilder()
+ .addChanges(
+ Change.newBuilder()
+ .setState(Change.State.EXISTS)
+ .setData(
+ Any.pack(
+ Operation.newBuilder()
+ .setName(opName)
+ .setDone(true)
+ .setResponse(Any.pack(resp))
+ .build()))
+ .build())
+ .build());
+ responseObserver.onCompleted();
+ if (e instanceof InterruptedException) {
+ Thread.currentThread().interrupt();
+ }
+ } finally {
+ operationsCache.remove(opName);
+ }
+ },
+ MoreExecutors.directExecutor());
}
/** Constructs a ChangeBatch with an exists state change that contains the given operation. */