From 1a43dcbde98ffc03e20d6005090f5346b4b2ed3e Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 9 Mar 2023 17:43:52 +0100 Subject: [PATCH] Automatically retry the build if encountered remote cache eviction error --- .../build/lib/buildtool/SkyframeBuilder.java | 160 ++++++++++++------ .../build/lib/exec/ExecutionOptions.java | 10 ++ .../remote/AbstractActionInputPrefetcher.java | 6 +- .../lib/remote/RemoteActionInputFetcher.java | 5 +- .../build/lib/remote/RemoteOutputService.java | 6 +- .../lib/runtime/BlazeCommandDispatcher.java | 1 + .../remote/ActionInputPrefetcherTestBase.java | 2 +- .../BuildWithoutTheBytesIntegrationTest.java | 93 +++++++++- 8 files changed, 215 insertions(+), 68 deletions(-) diff --git a/src/main/java/com/google/devtools/build/lib/buildtool/SkyframeBuilder.java b/src/main/java/com/google/devtools/build/lib/buildtool/SkyframeBuilder.java index 0fb45dc13848e5..a80ae0e2c795e1 100644 --- a/src/main/java/com/google/devtools/build/lib/buildtool/SkyframeBuilder.java +++ b/src/main/java/com/google/devtools/build/lib/buildtool/SkyframeBuilder.java @@ -32,7 +32,9 @@ import com.google.devtools.build.lib.analysis.test.TestProvider; import com.google.devtools.build.lib.bugreport.BugReporter; import com.google.devtools.build.lib.buildtool.buildevent.ExecutionProgressReceiverAvailableEvent; +import com.google.devtools.build.lib.events.Event; import com.google.devtools.build.lib.events.Reporter; +import com.google.devtools.build.lib.exec.ExecutionOptions; import com.google.devtools.build.lib.profiler.Profiler; import com.google.devtools.build.lib.profiler.SilentCloseable; import com.google.devtools.build.lib.runtime.KeepGoingOption; @@ -44,6 +46,7 @@ import com.google.devtools.build.lib.util.AbruptExitException; import com.google.devtools.build.lib.util.DetailedExitCode; import com.google.devtools.build.lib.util.DetailedExitCode.DetailedExitCodeComparator; +import com.google.devtools.build.lib.util.ExitCode; import com.google.devtools.build.lib.vfs.ModifiedFileSet; import com.google.devtools.build.skyframe.EvaluationResult; import com.google.devtools.common.options.OptionsProvider; @@ -100,12 +103,6 @@ public void buildArtifacts( TopLevelArtifactContext topLevelArtifactContext, boolean trustRemoteArtifacts) throws BuildFailedException, AbruptExitException, TestExecException, InterruptedException { - BuildRequestOptions buildRequestOptions = options.getOptions(BuildRequestOptions.class); - // TODO(bazel-team): Should use --experimental_fsvc_threads instead of the hardcoded constant - // but plumbing the flag through is hard. - int fsvcThreads = buildRequestOptions == null ? 200 : buildRequestOptions.fsvcThreads; - skyframeExecutor.detectModifiedOutputFiles( - modifiedOutputFiles, lastExecutionTimeRange, trustRemoteArtifacts, fsvcThreads); try (SilentCloseable c = Profiler.instance().profile("configureActionExecutor")) { skyframeExecutor.configureActionExecutor(fileCache, actionInputPrefetcher); } @@ -119,9 +116,6 @@ public void buildArtifacts( .getEventBus() .post(new ExecutionProgressReceiverAvailableEvent(executionProgressReceiver)); - List detailedExitCodes = new ArrayList<>(); - EvaluationResult result; - ActionExecutionStatusReporter statusReporter = ActionExecutionStatusReporter.create( reporter, skyframeExecutor.getEventBus()); @@ -141,70 +135,126 @@ public void buildArtifacts( parallelTests = Sets.difference(parallelTests, targetsToSkip); exclusiveTests = Sets.difference(exclusiveTests, targetsToSkip); + var remoteCacheEvictionRetries = + options.getOptions(ExecutionOptions.class).remoteCacheEvictionRetries; try { - result = - skyframeExecutor.buildArtifacts( + while (true) { + try { + buildArtifactsOnce( reporter, - resourceManager, - executor, artifacts, - targetsToBuild, - aspects, parallelTests, exclusiveTests, + targetsToBuild, + aspects, + executor, options, - actionCacheChecker, + lastExecutionTimeRange, + topLevelArtifactContext, + trustRemoteArtifacts, executionProgressReceiver, + isBuildingExclusiveArtifacts); + break; + } catch (BuildFailedException e) { + if (e.getDetailedExitCode().getExitCode().equals(ExitCode.REMOTE_CACHE_EVICTED)) { + if (remoteCacheEvictionRetries > 0) { + --remoteCacheEvictionRetries; + reporter.handle( + Event.warn("Found remote cache eviction error, retrying the build...")); + continue; + } + } + throw e; + } + } + } finally { + watchdog.stop(); + skyframeExecutor.setActionExecutionProgressReportingObjects(null, null, null); + statusReporter.unregisterFromEventBus(); + } + } + + private void buildArtifactsOnce( + Reporter reporter, + Set artifacts, + Set parallelTests, + Set exclusiveTests, + Set targetsToBuild, + ImmutableSet aspects, + Executor executor, + OptionsProvider options, + @Nullable Range lastExecutionTimeRange, + TopLevelArtifactContext topLevelArtifactContext, + boolean trustRemoteArtifacts, + ExecutionProgressReceiver executionProgressReceiver, + AtomicBoolean isBuildingExclusiveArtifacts) + throws BuildFailedException, AbruptExitException, TestExecException, InterruptedException { + BuildRequestOptions buildRequestOptions = options.getOptions(BuildRequestOptions.class); + // TODO(bazel-team): Should use --experimental_fsvc_threads instead of the hardcoded constant + // but plumbing the flag through is hard. + int fsvcThreads = buildRequestOptions == null ? 200 : buildRequestOptions.fsvcThreads; + skyframeExecutor.detectModifiedOutputFiles( + modifiedOutputFiles, lastExecutionTimeRange, trustRemoteArtifacts, fsvcThreads); + + List detailedExitCodes = new ArrayList<>(); + EvaluationResult result = + skyframeExecutor.buildArtifacts( + reporter, + resourceManager, + executor, + artifacts, + targetsToBuild, + aspects, + parallelTests, + exclusiveTests, + options, + actionCacheChecker, + executionProgressReceiver, + topLevelArtifactContext); + // progressReceiver is finished, so unsynchronized access to builtTargets is now safe. + DetailedExitCode detailedExitCode = + SkyframeErrorProcessor.processResult( + reporter, + result, + options.getOptions(KeepGoingOption.class).keepGoing, + skyframeExecutor.getCyclesReporter(), + bugReporter); + + if (detailedExitCode != null) { + detailedExitCodes.add(detailedExitCode); + } + + // Run exclusive tests: either tagged as "exclusive" or is run in an invocation with + // --test_output=streamed. + isBuildingExclusiveArtifacts.set(true); + for (ConfiguredTarget exclusiveTest : exclusiveTests) { + // Since only one artifact is being built at a time, we don't worry about an artifact being + // built and then the build being interrupted. + result = + skyframeExecutor.runExclusiveTest( + reporter, + resourceManager, + executor, + exclusiveTest, + options, + actionCacheChecker, topLevelArtifactContext); - // progressReceiver is finished, so unsynchronized access to builtTargets is now safe. - DetailedExitCode detailedExitCode = + detailedExitCode = SkyframeErrorProcessor.processResult( reporter, result, options.getOptions(KeepGoingOption.class).keepGoing, skyframeExecutor.getCyclesReporter(), bugReporter); + Preconditions.checkState( + detailedExitCode != null || !result.keyNames().isEmpty(), + "Build reported as successful but test %s not executed: %s", + exclusiveTest, + result); if (detailedExitCode != null) { detailedExitCodes.add(detailedExitCode); } - - // Run exclusive tests: either tagged as "exclusive" or is run in an invocation with - // --test_output=streamed. - isBuildingExclusiveArtifacts.set(true); - for (ConfiguredTarget exclusiveTest : exclusiveTests) { - // Since only one artifact is being built at a time, we don't worry about an artifact being - // built and then the build being interrupted. - result = - skyframeExecutor.runExclusiveTest( - reporter, - resourceManager, - executor, - exclusiveTest, - options, - actionCacheChecker, - topLevelArtifactContext); - detailedExitCode = - SkyframeErrorProcessor.processResult( - reporter, - result, - options.getOptions(KeepGoingOption.class).keepGoing, - skyframeExecutor.getCyclesReporter(), - bugReporter); - Preconditions.checkState( - detailedExitCode != null || !result.keyNames().isEmpty(), - "Build reported as successful but test %s not executed: %s", - exclusiveTest, - result); - - if (detailedExitCode != null) { - detailedExitCodes.add(detailedExitCode); - } - } - } finally { - watchdog.stop(); - skyframeExecutor.setActionExecutionProgressReportingObjects(null, null, null); - statusReporter.unregisterFromEventBus(); } if (detailedExitCodes.isEmpty()) { diff --git a/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java b/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java index 21a0204bfbfdac..0fae3b7d6e2cfb 100644 --- a/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java +++ b/src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java @@ -495,6 +495,16 @@ public boolean usingLocalTestJobs() { + "test log. Otherwise, Bazel generates a test.xml as part of the test action.") public boolean splitXmlGeneration; + @Option( + name = "experimental_remote_cache_eviction_retries", + defaultValue = "0", + documentationCategory = OptionDocumentationCategory.REMOTE, + effectTags = {OptionEffectTag.EXECUTION}, + help = + "The maximum number of attempts to retry if the build encountered remote cache eviction error.") + public int remoteCacheEvictionRetries; + + /** An enum for specifying different formats of test output. */ public enum TestOutputFormat { SUMMARY, // Provide summary output only. diff --git a/src/main/java/com/google/devtools/build/lib/remote/AbstractActionInputPrefetcher.java b/src/main/java/com/google/devtools/build/lib/remote/AbstractActionInputPrefetcher.java index 78bbfd750abd34..502bfe2f1fd77e 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/AbstractActionInputPrefetcher.java +++ b/src/main/java/com/google/devtools/build/lib/remote/AbstractActionInputPrefetcher.java @@ -684,7 +684,9 @@ public void flushOutputTree() throws InterruptedException { downloadCache.awaitInProgressTasks(); } - public ImmutableSet getMissingActionInputs() { - return ImmutableSet.copyOf(missingActionInputs); + public ImmutableSet takeMissingActionInputs() { + var result = ImmutableSet.copyOf(missingActionInputs); + missingActionInputs.removeAll(result); + return result; } } diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java index 28b4ccbb618c77..38287b28743922 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteActionInputFetcher.java @@ -152,10 +152,7 @@ protected Completable onErrorResumeNext(Throwable error) { new EnvironmentalExecException( (BulkTransferException) error, FailureDetail.newBuilder() - .setMessage( - "Failed to fetch blobs because they do not exist remotely." - + " Build without the Bytes does not work if your remote" - + " cache evicts blobs during builds") + .setMessage("Failed to fetch blobs because they do not exist remotely") .setSpawn(FailureDetails.Spawn.newBuilder().setCode(code)) .build()); } diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteOutputService.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteOutputService.java index 2ea7a0f2aa9219..08c51f55c89625 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteOutputService.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteOutputService.java @@ -20,13 +20,13 @@ import com.google.common.collect.ImmutableMap; import com.google.common.eventbus.Subscribe; import com.google.devtools.build.lib.actions.Action; +import com.google.devtools.build.lib.actions.ActionCompletionEvent; import com.google.devtools.build.lib.actions.ActionInputMap; import com.google.devtools.build.lib.actions.Artifact; import com.google.devtools.build.lib.actions.ArtifactPathResolver; import com.google.devtools.build.lib.actions.FilesetOutputSymlink; import com.google.devtools.build.lib.actions.cache.MetadataHandler; import com.google.devtools.build.lib.actions.cache.MetadataInjector; -import com.google.devtools.build.lib.buildtool.buildevent.ExecutionPhaseCompleteEvent; import com.google.devtools.build.lib.events.EventHandler; import com.google.devtools.build.lib.util.AbruptExitException; import com.google.devtools.build.lib.vfs.BatchStat; @@ -115,9 +115,9 @@ public void finalizeBuild(boolean buildSuccessful) { } @Subscribe - public void onExecutionPhaseCompleteEvent(ExecutionPhaseCompleteEvent event) { + public void onActionCompletion(ActionCompletionEvent event) { if (leaseService != null && actionInputFetcher != null) { - leaseService.handleMissingInputs(actionInputFetcher.getMissingActionInputs()); + leaseService.handleMissingInputs(actionInputFetcher.takeMissingActionInputs()); } } diff --git a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java index 0fbd5a2d3f0327..b7620d4b1fcf21 100644 --- a/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java +++ b/src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java @@ -54,6 +54,7 @@ import com.google.devtools.build.lib.util.AnsiStrippingOutputStream; import com.google.devtools.build.lib.util.DebugLoggerConfigurator; import com.google.devtools.build.lib.util.DetailedExitCode; +import com.google.devtools.build.lib.util.ExitCode; import com.google.devtools.build.lib.util.InterruptedFailureDetails; import com.google.devtools.build.lib.util.LoggingUtil; import com.google.devtools.build.lib.util.Pair; diff --git a/src/test/java/com/google/devtools/build/lib/remote/ActionInputPrefetcherTestBase.java b/src/test/java/com/google/devtools/build/lib/remote/ActionInputPrefetcherTestBase.java index c4c99508fb7b06..1033143fae1df1 100644 --- a/src/test/java/com/google/devtools/build/lib/remote/ActionInputPrefetcherTestBase.java +++ b/src/test/java/com/google/devtools/build/lib/remote/ActionInputPrefetcherTestBase.java @@ -596,7 +596,7 @@ public void missingInputs_addedToList() { assertThrows( Exception.class, () -> wait(prefetcher.prefetchFiles(metadata.keySet(), metadataProvider))); - assertThat(prefetcher.getMissingActionInputs()).contains(a); + assertThat(prefetcher.takeMissingActionInputs()).contains(a); } protected static void wait(ListenableFuture future) diff --git a/src/test/java/com/google/devtools/build/lib/remote/BuildWithoutTheBytesIntegrationTest.java b/src/test/java/com/google/devtools/build/lib/remote/BuildWithoutTheBytesIntegrationTest.java index c27756888f02ab..cc77892d3bc9f0 100644 --- a/src/test/java/com/google/devtools/build/lib/remote/BuildWithoutTheBytesIntegrationTest.java +++ b/src/test/java/com/google/devtools/build/lib/remote/BuildWithoutTheBytesIntegrationTest.java @@ -474,9 +474,7 @@ public void remoteCacheEvictBlobs_whenPrefetchingInput_exitWithCode39() throws E // Assert: Exit code is 39 assertThat(error) .hasMessageThat() - .contains( - "Build without the Bytes does not work if your remote cache evicts blobs" - + " during builds"); + .contains("Failed to fetch blobs because they do not exist remotely"); assertThat(error).hasMessageThat().contains(String.format("%s/%s", hashCode, bytes.length)); assertThat(error.getDetailedExitCode().getExitCode().getNumericExitCode()).isEqualTo(39); } @@ -624,4 +622,93 @@ public void remoteCacheEvictBlobs_whenUploadingInputTree_incrementalBuildCanCont assertValidOutputFile( "a/bar.out", "file-inside\nupdated bar" + lineSeparator(), /* isLocal= */ true); } + + @Test + public void remoteCacheEvictBlobs_whenPrefetchingInput_retryTheBuild() throws Exception { + // Arrange: Prepare workspace and populate remote cache + write( + "a/BUILD", + "genrule(", + " name = 'foo',", + " srcs = ['foo.in'],", + " outs = ['foo.out'],", + " cmd = 'cat $(SRCS) > $@',", + ")", + "genrule(", + " name = 'bar',", + " srcs = ['foo.out', 'bar.in'],", + " outs = ['bar.out'],", + " cmd = 'cat $(SRCS) > $@',", + " tags = ['no-remote-exec'],", + ")"); + write("a/foo.in", "foo"); + write("a/bar.in", "bar"); + + // Populate remote cache + buildTarget("//a:bar"); + getOutputPath("a/foo.out").delete(); + getOutputPath("a/bar.out").delete(); + getOutputBase().getRelative("action_cache").deleteTreesBelow(); + restartServer(); + + // Clean build, foo.out isn't downloaded + buildTarget("//a:bar"); + assertOutputDoesNotExist("a/foo.out"); + + // Act: Evict blobs from remote cache and do an incremental build + evictAllBlobs(); + write("a/bar.in", "updated bar"); + addOptions("--experimental_remote_cache_eviction_retries=1"); + buildTarget("//a:bar"); + + // Assert: the build has been retried and output is correctly built. + events.assertContainsWarning("Found remote cache eviction error, retrying the build..."); + assertValidOutputFile("a/bar.out", "foo" + lineSeparator() + "updated bar" + lineSeparator()); + } + + @Test + public void remoteCacheEvictBlobs_whenUploadingInput_retryTheBuild() throws Exception { + // Arrange: Prepare workspace and populate remote cache + write( + "a/BUILD", + "genrule(", + " name = 'foo',", + " srcs = ['foo.in'],", + " outs = ['foo.out'],", + " cmd = 'cat $(SRCS) > $@',", + ")", + "genrule(", + " name = 'bar',", + " srcs = ['foo.out', 'bar.in'],", + " outs = ['bar.out'],", + " cmd = 'cat $(SRCS) > $@',", + ")"); + write("a/foo.in", "foo"); + write("a/bar.in", "bar"); + + // Populate remote cache + setDownloadAll(); + buildTarget("//a:bar"); + waitDownloads(); + getOutputPath("a/foo.out").delete(); + getOutputPath("a/bar.out").delete(); + getOutputBase().getRelative("action_cache").deleteTreesBelow(); + restartServer(); + + // Clean build, foo.out isn't downloaded + buildTarget("//a:bar"); + assertOutputDoesNotExist("a/foo.out"); + + // Act: Evict blobs from remote cache and do an incremental build + evictAllBlobs(); + write("a/bar.in", "updated bar"); + addOptions("--experimental_remote_cache_eviction_retries=1"); + setDownloadToplevel(); + buildTarget("//a:bar"); + waitDownloads(); + + // Assert: the build has been retried and output is correctly built. + events.assertContainsWarning("Found remote cache eviction error, retrying the build..."); + assertValidOutputFile("a/bar.out", "foo" + lineSeparator() + "updated bar" + lineSeparator()); + } }