Skip to content

Commit

Permalink
Automatically retry the build if encountered remote cache eviction error
Browse files Browse the repository at this point in the history
  • Loading branch information
coeuvre committed Mar 10, 2023
1 parent f9008f6 commit 1a43dcb
Show file tree
Hide file tree
Showing 8 changed files with 215 additions and 68 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@
import com.google.devtools.build.lib.analysis.test.TestProvider;
import com.google.devtools.build.lib.bugreport.BugReporter;
import com.google.devtools.build.lib.buildtool.buildevent.ExecutionProgressReceiverAvailableEvent;
import com.google.devtools.build.lib.events.Event;
import com.google.devtools.build.lib.events.Reporter;
import com.google.devtools.build.lib.exec.ExecutionOptions;
import com.google.devtools.build.lib.profiler.Profiler;
import com.google.devtools.build.lib.profiler.SilentCloseable;
import com.google.devtools.build.lib.runtime.KeepGoingOption;
Expand All @@ -44,6 +46,7 @@
import com.google.devtools.build.lib.util.AbruptExitException;
import com.google.devtools.build.lib.util.DetailedExitCode;
import com.google.devtools.build.lib.util.DetailedExitCode.DetailedExitCodeComparator;
import com.google.devtools.build.lib.util.ExitCode;
import com.google.devtools.build.lib.vfs.ModifiedFileSet;
import com.google.devtools.build.skyframe.EvaluationResult;
import com.google.devtools.common.options.OptionsProvider;
Expand Down Expand Up @@ -100,12 +103,6 @@ public void buildArtifacts(
TopLevelArtifactContext topLevelArtifactContext,
boolean trustRemoteArtifacts)
throws BuildFailedException, AbruptExitException, TestExecException, InterruptedException {
BuildRequestOptions buildRequestOptions = options.getOptions(BuildRequestOptions.class);
// TODO(bazel-team): Should use --experimental_fsvc_threads instead of the hardcoded constant
// but plumbing the flag through is hard.
int fsvcThreads = buildRequestOptions == null ? 200 : buildRequestOptions.fsvcThreads;
skyframeExecutor.detectModifiedOutputFiles(
modifiedOutputFiles, lastExecutionTimeRange, trustRemoteArtifacts, fsvcThreads);
try (SilentCloseable c = Profiler.instance().profile("configureActionExecutor")) {
skyframeExecutor.configureActionExecutor(fileCache, actionInputPrefetcher);
}
Expand All @@ -119,9 +116,6 @@ public void buildArtifacts(
.getEventBus()
.post(new ExecutionProgressReceiverAvailableEvent(executionProgressReceiver));

List<DetailedExitCode> detailedExitCodes = new ArrayList<>();
EvaluationResult<?> result;

ActionExecutionStatusReporter statusReporter = ActionExecutionStatusReporter.create(
reporter, skyframeExecutor.getEventBus());

Expand All @@ -141,70 +135,126 @@ public void buildArtifacts(
parallelTests = Sets.difference(parallelTests, targetsToSkip);
exclusiveTests = Sets.difference(exclusiveTests, targetsToSkip);

var remoteCacheEvictionRetries =
options.getOptions(ExecutionOptions.class).remoteCacheEvictionRetries;
try {
result =
skyframeExecutor.buildArtifacts(
while (true) {
try {
buildArtifactsOnce(
reporter,
resourceManager,
executor,
artifacts,
targetsToBuild,
aspects,
parallelTests,
exclusiveTests,
targetsToBuild,
aspects,
executor,
options,
actionCacheChecker,
lastExecutionTimeRange,
topLevelArtifactContext,
trustRemoteArtifacts,
executionProgressReceiver,
isBuildingExclusiveArtifacts);
break;
} catch (BuildFailedException e) {
if (e.getDetailedExitCode().getExitCode().equals(ExitCode.REMOTE_CACHE_EVICTED)) {
if (remoteCacheEvictionRetries > 0) {
--remoteCacheEvictionRetries;
reporter.handle(
Event.warn("Found remote cache eviction error, retrying the build..."));
continue;
}
}
throw e;
}
}
} finally {
watchdog.stop();
skyframeExecutor.setActionExecutionProgressReportingObjects(null, null, null);
statusReporter.unregisterFromEventBus();
}
}

private void buildArtifactsOnce(
Reporter reporter,
Set<Artifact> artifacts,
Set<ConfiguredTarget> parallelTests,
Set<ConfiguredTarget> exclusiveTests,
Set<ConfiguredTarget> targetsToBuild,
ImmutableSet<AspectKey> aspects,
Executor executor,
OptionsProvider options,
@Nullable Range<Long> lastExecutionTimeRange,
TopLevelArtifactContext topLevelArtifactContext,
boolean trustRemoteArtifacts,
ExecutionProgressReceiver executionProgressReceiver,
AtomicBoolean isBuildingExclusiveArtifacts)
throws BuildFailedException, AbruptExitException, TestExecException, InterruptedException {
BuildRequestOptions buildRequestOptions = options.getOptions(BuildRequestOptions.class);
// TODO(bazel-team): Should use --experimental_fsvc_threads instead of the hardcoded constant
// but plumbing the flag through is hard.
int fsvcThreads = buildRequestOptions == null ? 200 : buildRequestOptions.fsvcThreads;
skyframeExecutor.detectModifiedOutputFiles(
modifiedOutputFiles, lastExecutionTimeRange, trustRemoteArtifacts, fsvcThreads);

List<DetailedExitCode> detailedExitCodes = new ArrayList<>();
EvaluationResult<?> result =
skyframeExecutor.buildArtifacts(
reporter,
resourceManager,
executor,
artifacts,
targetsToBuild,
aspects,
parallelTests,
exclusiveTests,
options,
actionCacheChecker,
executionProgressReceiver,
topLevelArtifactContext);
// progressReceiver is finished, so unsynchronized access to builtTargets is now safe.
DetailedExitCode detailedExitCode =
SkyframeErrorProcessor.processResult(
reporter,
result,
options.getOptions(KeepGoingOption.class).keepGoing,
skyframeExecutor.getCyclesReporter(),
bugReporter);

if (detailedExitCode != null) {
detailedExitCodes.add(detailedExitCode);
}

// Run exclusive tests: either tagged as "exclusive" or is run in an invocation with
// --test_output=streamed.
isBuildingExclusiveArtifacts.set(true);
for (ConfiguredTarget exclusiveTest : exclusiveTests) {
// Since only one artifact is being built at a time, we don't worry about an artifact being
// built and then the build being interrupted.
result =
skyframeExecutor.runExclusiveTest(
reporter,
resourceManager,
executor,
exclusiveTest,
options,
actionCacheChecker,
topLevelArtifactContext);
// progressReceiver is finished, so unsynchronized access to builtTargets is now safe.
DetailedExitCode detailedExitCode =
detailedExitCode =
SkyframeErrorProcessor.processResult(
reporter,
result,
options.getOptions(KeepGoingOption.class).keepGoing,
skyframeExecutor.getCyclesReporter(),
bugReporter);
Preconditions.checkState(
detailedExitCode != null || !result.keyNames().isEmpty(),
"Build reported as successful but test %s not executed: %s",
exclusiveTest,
result);

if (detailedExitCode != null) {
detailedExitCodes.add(detailedExitCode);
}

// Run exclusive tests: either tagged as "exclusive" or is run in an invocation with
// --test_output=streamed.
isBuildingExclusiveArtifacts.set(true);
for (ConfiguredTarget exclusiveTest : exclusiveTests) {
// Since only one artifact is being built at a time, we don't worry about an artifact being
// built and then the build being interrupted.
result =
skyframeExecutor.runExclusiveTest(
reporter,
resourceManager,
executor,
exclusiveTest,
options,
actionCacheChecker,
topLevelArtifactContext);
detailedExitCode =
SkyframeErrorProcessor.processResult(
reporter,
result,
options.getOptions(KeepGoingOption.class).keepGoing,
skyframeExecutor.getCyclesReporter(),
bugReporter);
Preconditions.checkState(
detailedExitCode != null || !result.keyNames().isEmpty(),
"Build reported as successful but test %s not executed: %s",
exclusiveTest,
result);

if (detailedExitCode != null) {
detailedExitCodes.add(detailedExitCode);
}
}
} finally {
watchdog.stop();
skyframeExecutor.setActionExecutionProgressReportingObjects(null, null, null);
statusReporter.unregisterFromEventBus();
}

if (detailedExitCodes.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,16 @@ public boolean usingLocalTestJobs() {
+ "test log. Otherwise, Bazel generates a test.xml as part of the test action.")
public boolean splitXmlGeneration;

/**
 * Value of the {@code --experimental_remote_cache_eviction_retries} flag: an upper bound on how
 * many times the build is retried after it fails with a remote cache eviction error.
 *
 * <p>Defaults to {@code 0}. Callers that gate retries on this value being greater than zero will
 * therefore not retry unless the flag is set explicitly.
 */
@Option(
name = "experimental_remote_cache_eviction_retries",
defaultValue = "0",
documentationCategory = OptionDocumentationCategory.REMOTE,
effectTags = {OptionEffectTag.EXECUTION},
help =
"The maximum number of attempts to retry if the build encountered remote cache eviction error.")
public int remoteCacheEvictionRetries;


/** An enum for specifying different formats of test output. */
public enum TestOutputFormat {
SUMMARY, // Provide summary output only.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,9 @@ public void flushOutputTree() throws InterruptedException {
downloadCache.awaitInProgressTasks();
}

public ImmutableSet<ActionInput> getMissingActionInputs() {
return ImmutableSet.copyOf(missingActionInputs);
/**
 * Drains the recorded missing action inputs.
 *
 * <p>Takes an immutable snapshot of {@code missingActionInputs} and then removes exactly the
 * snapshotted elements from it, so each recorded input is handed out by at most one call. Only
 * the snapshotted elements are removed (rather than clearing the collection), so anything added
 * after the snapshot was taken stays behind for the next call.
 *
 * @return the inputs that were recorded as missing at the time of the call; empty if none
 */
public ImmutableSet<ActionInput> takeMissingActionInputs() {
  ImmutableSet<ActionInput> snapshot = ImmutableSet.copyOf(missingActionInputs);
  // Remove only what is being returned; do not clear() the backing collection.
  missingActionInputs.removeAll(snapshot);
  return snapshot;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,7 @@ protected Completable onErrorResumeNext(Throwable error) {
new EnvironmentalExecException(
(BulkTransferException) error,
FailureDetail.newBuilder()
.setMessage(
"Failed to fetch blobs because they do not exist remotely."
+ " Build without the Bytes does not work if your remote"
+ " cache evicts blobs during builds")
.setMessage("Failed to fetch blobs because they do not exist remotely")
.setSpawn(FailureDetails.Spawn.newBuilder().setCode(code))
.build());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@
import com.google.common.collect.ImmutableMap;
import com.google.common.eventbus.Subscribe;
import com.google.devtools.build.lib.actions.Action;
import com.google.devtools.build.lib.actions.ActionCompletionEvent;
import com.google.devtools.build.lib.actions.ActionInputMap;
import com.google.devtools.build.lib.actions.Artifact;
import com.google.devtools.build.lib.actions.ArtifactPathResolver;
import com.google.devtools.build.lib.actions.FilesetOutputSymlink;
import com.google.devtools.build.lib.actions.cache.MetadataHandler;
import com.google.devtools.build.lib.actions.cache.MetadataInjector;
import com.google.devtools.build.lib.buildtool.buildevent.ExecutionPhaseCompleteEvent;
import com.google.devtools.build.lib.events.EventHandler;
import com.google.devtools.build.lib.util.AbruptExitException;
import com.google.devtools.build.lib.vfs.BatchStat;
Expand Down Expand Up @@ -115,9 +115,9 @@ public void finalizeBuild(boolean buildSuccessful) {
}

@Subscribe
public void onExecutionPhaseCompleteEvent(ExecutionPhaseCompleteEvent event) {
public void onActionCompletion(ActionCompletionEvent event) {
if (leaseService != null && actionInputFetcher != null) {
leaseService.handleMissingInputs(actionInputFetcher.getMissingActionInputs());
leaseService.handleMissingInputs(actionInputFetcher.takeMissingActionInputs());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
import com.google.devtools.build.lib.util.AnsiStrippingOutputStream;
import com.google.devtools.build.lib.util.DebugLoggerConfigurator;
import com.google.devtools.build.lib.util.DetailedExitCode;
import com.google.devtools.build.lib.util.ExitCode;
import com.google.devtools.build.lib.util.InterruptedFailureDetails;
import com.google.devtools.build.lib.util.LoggingUtil;
import com.google.devtools.build.lib.util.Pair;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,7 @@ public void missingInputs_addedToList() {
assertThrows(
Exception.class, () -> wait(prefetcher.prefetchFiles(metadata.keySet(), metadataProvider)));

assertThat(prefetcher.getMissingActionInputs()).contains(a);
assertThat(prefetcher.takeMissingActionInputs()).contains(a);
}

protected static void wait(ListenableFuture<Void> future)
Expand Down
Loading

0 comments on commit 1a43dcb

Please sign in to comment.