Skip to content

Commit

Permalink
[FLINK-35831] Rotate jobId for both savepoint and stateless deploys
Browse files Browse the repository at this point in the history
  • Loading branch information
gyfora committed Jul 15, 2024
1 parent ffaa3dd commit 2b82a93
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ public void deploy(
statusRecorder.patchAndCacheStatus(relatedResource, ctx.getKubernetesClient());
}

setJobIdIfNecessary(spec, relatedResource, deployConfig, ctx.getKubernetesClient());
setJobIdIfNecessary(
relatedResource, deployConfig, ctx.getKubernetesClient(), requireHaMetadata);

eventRecorder.triggerEvent(
relatedResource,
Expand All @@ -193,10 +194,10 @@ public void deploy(
}

private void setJobIdIfNecessary(
FlinkDeploymentSpec spec,
FlinkDeployment resource,
Configuration deployConfig,
KubernetesClient client) {
KubernetesClient client,
boolean lastStateDeploy) {
// The jobId assigned by Flink would be constant,
// overwrite to avoid checkpoint path conflicts.
// https://issues.apache.org/jira/browse/FLINK-19358
Expand All @@ -208,9 +209,8 @@ private void setJobIdIfNecessary(
}

var status = resource.getStatus();
// generate jobId initially or rotate on every deployment when mode is stateless
if (status.getJobStatus().getJobId() == null
|| spec.getJob().getUpgradeMode() == UpgradeMode.STATELESS) {
// Rotate job id when not last-state deployment
if (status.getJobStatus().getJobId() == null || !lastStateDeploy) {
String jobId = JobID.generate().toHexString();
// record before first deployment to ensure we use it on any retry
status.getJobStatus().setJobId(jobId);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,15 @@ public void testUpgrade(FlinkVersion flinkVersion) throws Exception {
.isFirstDeployment());

JobID jobId = runningJobs.get(0).f1.getJobId();
verifyJobId(deployment, runningJobs.get(0).f1, runningJobs.get(0).f2, jobId);

// Last state upgrade
FlinkDeployment lastStateUpgrade = ReconciliationUtils.clone(deployment);
getJobSpec(lastStateUpgrade).setUpgradeMode(UpgradeMode.LAST_STATE);
lastStateUpgrade.getSpec().setRestartNonce(1234L);
reconciler.reconcile(deployment, context);
reconciler.reconcile(deployment, context);
// Make sure jobId is rotated on last-state startup
verifyJobId(lastStateUpgrade, runningJobs.get(0).f1, runningJobs.get(0).f2, jobId);

// Test stateless upgrade
FlinkDeployment statelessUpgrade = ReconciliationUtils.clone(deployment);
Expand Down Expand Up @@ -284,7 +292,10 @@ public void testUpgrade(FlinkVersion flinkVersion) throws Exception {
SnapshotTriggerType.UPGRADE,
getSavepointInfo(statefulUpgrade).getLastSavepoint().getTriggerType());
assertEquals(SnapshotStatus.SUCCEEDED, getLastSnapshotStatus(statefulUpgrade, SAVEPOINT));
verifyJobId(deployment, runningJobs.get(0).f1, runningJobs.get(0).f2, jobId);

// Make sure jobId rotated on savepoint
verifyNewJobId(runningJobs.get(0).f1, runningJobs.get(0).f2, jobId);
jobId = runningJobs.get(0).f1.getJobId();

getJobSpec(deployment).setUpgradeMode(UpgradeMode.LAST_STATE);
deployment.getSpec().setRestartNonce(100L);
Expand Down Expand Up @@ -325,7 +336,8 @@ public void testUpgrade(FlinkVersion flinkVersion) throws Exception {

assertEquals(1, flinkService.getRunningCount());
assertEquals("finished_sp", runningJobs.get(0).f0);
verifyJobId(deployment, runningJobs.get(0).f1, runningJobs.get(0).f2, jobId);
// Make sure jobId rotated on savepoint
verifyNewJobId(runningJobs.get(0).f1, runningJobs.get(0).f2, jobId);
}

private void verifyJobId(
Expand All @@ -335,6 +347,13 @@ private void verifyJobId(
assertEquals(conf.get(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID), jobId.toHexString());
}

private void verifyNewJobId(JobStatusMessage status, Configuration conf, JobID jobId) {
assertNotEquals(jobId.toHexString(), status.getJobId());
assertEquals(
conf.get(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID),
status.getJobId().toHexString());
}

@NotNull
private static Savepoint savepointFromSavepointInfo(
SavepointInfo savepointInfo, Long savepointTriggerNonce) {
Expand Down

0 comments on commit 2b82a93

Please sign in to comment.