-
Notifications
You must be signed in to change notification settings - Fork 3.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix(controller): Use node.Name instead of node.DisplayName for onExit nodes #5486
Changes from 7 commits
1fdc86f
db33dc3
3eb964c
fe21678
882ebea
3db7f9b
bc9ad98
63dec32
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -301,6 +301,31 @@ func withAnnotation(key, val string) with { | |
return func(pod *apiv1.Pod) { pod.Annotations[key] = val } | ||
} | ||
|
||
// createRunningPods creates the pods that are marked as running in a given test so that they can be accessed by the | ||
// pod assessor | ||
func createRunningPods(ctx context.Context, woc *wfOperationCtx, with ...with) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Useful utility testing function to create pods in the client that are supposed to be running according to the test workflow status. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unused? delete? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Only the |
||
podcs := woc.controller.kubeclientset.CoreV1().Pods(woc.wf.GetNamespace()) | ||
for _, node := range woc.wf.Status.Nodes { | ||
if node.Type == wfv1.NodeTypePod && node.Phase == wfv1.NodeRunning { | ||
pod, _ := podcs.Create(ctx, &apiv1.Pod{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Name: node.ID, | ||
Annotations: map[string]string{ | ||
"workflows.argoproj.io/node-name": node.Name, | ||
}, | ||
Labels: map[string]string{ | ||
"workflows.argoproj.io/workflow": woc.wf.Name, | ||
}, | ||
}, | ||
Status: apiv1.PodStatus{ | ||
Phase: apiv1.PodRunning, | ||
}, | ||
}, metav1.CreateOptions{}) | ||
_ = woc.controller.podInformer.GetStore().Add(pod) | ||
} | ||
} | ||
} | ||
|
||
// makePodsPhase acts like a pod controller and simulates the transition of pods transitioning into a specified state | ||
func makePodsPhase(ctx context.Context, woc *wfOperationCtx, phase apiv1.PodPhase, with ...with) { | ||
podcs := woc.controller.kubeclientset.CoreV1().Pods(woc.wf.GetNamespace()) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -695,7 +695,20 @@ func (d *dagContext) evaluateDependsLogic(taskName string) (bool, bool, error) { | |
} | ||
|
||
// If a task happens to have an onExit node, don't proceed until the onExit node is fulfilled | ||
if onExitNode := d.wf.GetNodeByName(common.GenerateOnExitNodeName(taskName)); onExitNode != nil { | ||
if onExitNode := d.wf.GetNodeByName(common.GenerateOnExitNodeName(depNode.Name)); onExitNode != nil { | ||
if !onExitNode.Fulfilled() { | ||
return false, false, nil | ||
} | ||
} | ||
|
||
// Previously we used `depNode.DisplayName` to generate all onExit node names. However, as these can be non-unique | ||
// we transitioned to using `depNode.Name` instead, which are guaranteed to be unique. In order to not disrupt | ||
// running workflows during upgrade time, we do an additional check to see if there is an onExit node with the old | ||
// name (`depNode.DisplayName`). | ||
// TODO: This scaffold code should be removed after a couple of "grace period" version upgrades to allow transitions. It was introduced in v3.0.0 | ||
// See more: https://github.com/argoproj/argo-workflows/issues/5502 | ||
legacyOnExitNodeName := common.GenerateOnExitNodeName(depNode.DisplayName) | ||
if onExitNode := d.wf.GetNodeByName(legacyOnExitNodeName); onExitNode != nil && d.wf.GetNodeByName(depNode.Name).HasChild(onExitNode.ID) { | ||
Comment on lines
+704
to
+711
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is tested by There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nice |
||
if !onExitNode.Fulfilled() { | ||
return false, false, nil | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2940,7 +2940,24 @@ func (woc *wfOperationCtx) createTemplateContext(scope wfv1.ResourceScope, resou | |
func (woc *wfOperationCtx) runOnExitNode(ctx context.Context, templateRef, parentDisplayName, parentNodeName, boundaryID string, tmplCtx *templateresolution.Context) (bool, *wfv1.NodeStatus, error) { | ||
if templateRef != "" && woc.GetShutdownStrategy().ShouldExecute(true) { | ||
woc.log.Infof("Running OnExit handler: %s", templateRef) | ||
onExitNodeName := common.GenerateOnExitNodeName(parentDisplayName) | ||
|
||
// Previously we used `parentDisplayName` to generate all onExit node names. However, as these can be non-unique | ||
// we transitioned to using `parentNodeName` instead, which are guaranteed to be unique. In order to not disrupt | ||
// running workflows during upgrade time, we first check if there is an onExit node that currently exists with the | ||
// legacy name AND said node is a child of the parent node. If it does, we continue execution with the legacy name. | ||
// If it doesn't, we use the new (and unique) name for all operations henceforth. | ||
// TODO: This scaffold code should be removed after a couple of "grace period" version upgrades to allow transitions. It was introduced in v3.0.0 | ||
// When the scaffold code is removed, we should only have the following: | ||
// | ||
// onExitNodeName := common.GenerateOnExitNodeName(parentNodeName) | ||
// | ||
// See more: https://github.com/argoproj/argo-workflows/issues/5502 | ||
onExitNodeName := common.GenerateOnExitNodeName(parentNodeName) | ||
legacyOnExitNodeName := common.GenerateOnExitNodeName(parentDisplayName) | ||
if legacyNameNode := woc.wf.GetNodeByName(legacyOnExitNodeName); legacyNameNode != nil && woc.wf.GetNodeByName(parentNodeName).HasChild(legacyNameNode.ID) { | ||
onExitNodeName = legacyOnExitNodeName | ||
} | ||
Comment on lines
+2944
to
+2959
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is tested by There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the block of code looks to be duplicated - maybe create a utility func? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I had a utility function before, but once all the parameters were taken into account it didn't really reduce complexity so I decided to keep it like so. |
||
|
||
onExitNode, err := woc.executeTemplate(ctx, onExitNodeName, &wfv1.WorkflowStep{Template: templateRef}, tmplCtx, woc.execWf.Spec.Arguments, &executeTemplateOpts{ | ||
boundaryID: boundaryID, | ||
onExitTemplate: true, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6374,3 +6374,224 @@ func TestRootRetryStrategyCompletes(t *testing.T) { | |
|
||
assert.Equal(t, wfv1.WorkflowSucceeded, woc.wf.Status.Phase) | ||
} | ||
|
||
const testOnExitNameBackwardsCompatibility = ` | ||
apiVersion: argoproj.io/v1alpha1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've already ensured these test workflows are minimal. |
||
kind: Workflow | ||
metadata: | ||
name: hello-world-69h5d | ||
spec: | ||
entrypoint: main | ||
templates: | ||
- name: main | ||
steps: | ||
- - name: run | ||
onExit: pass | ||
template: pass | ||
- container: | ||
args: | ||
- exit 0 | ||
command: | ||
- sh | ||
- -c | ||
image: alpine | ||
name: pass | ||
ttlStrategy: | ||
secondsAfterCompletion: 600 | ||
status: | ||
nodes: | ||
hello-world-69h5d: | ||
children: | ||
- hello-world-69h5d-4087924081 | ||
displayName: hello-world-69h5d | ||
finishedAt: "2021-03-24T14:53:32Z" | ||
id: hello-world-69h5d | ||
name: hello-world-69h5d | ||
outboundNodes: | ||
- hello-world-69h5d-928074325 | ||
phase: Running | ||
startedAt: "2021-03-24T14:53:18Z" | ||
templateName: main | ||
templateScope: local/hello-world-69h5d | ||
type: Steps | ||
hello-world-69h5d-928074325: | ||
boundaryID: hello-world-69h5d | ||
displayName: run.onExit | ||
finishedAt: "2021-03-24T14:53:31Z" | ||
hostNodeName: k3d-k3s-default-server-0 | ||
id: hello-world-69h5d-928074325 | ||
name: run.onExit | ||
phase: Running | ||
startedAt: "2021-03-24T14:53:25Z" | ||
templateName: pass | ||
templateScope: local/hello-world-69h5d | ||
type: Pod | ||
hello-world-69h5d-2500098386: | ||
boundaryID: hello-world-69h5d | ||
children: | ||
- hello-world-69h5d-928074325 | ||
displayName: run | ||
finishedAt: "2021-03-24T14:53:24Z" | ||
hostNodeName: k3d-k3s-default-server-0 | ||
id: hello-world-69h5d-2500098386 | ||
name: hello-world-69h5d[0].run | ||
phase: Succeeded | ||
startedAt: "2021-03-24T14:53:18Z" | ||
templateName: pass | ||
templateScope: local/hello-world-69h5d | ||
type: Pod | ||
hello-world-69h5d-4087924081: | ||
boundaryID: hello-world-69h5d | ||
children: | ||
- hello-world-69h5d-2500098386 | ||
displayName: '[0]' | ||
finishedAt: "2021-03-24T14:53:32Z" | ||
id: hello-world-69h5d-4087924081 | ||
name: hello-world-69h5d[0] | ||
phase: Running | ||
startedAt: "2021-03-24T14:53:18Z" | ||
templateScope: local/hello-world-69h5d | ||
type: StepGroup | ||
phase: Running | ||
startedAt: "2021-03-24T14:53:18Z" | ||
` | ||
|
||
// Previously we used `parentNodeDisplayName` to generate all onExit node names. However, as these can be non-unique | ||
// we transitioned to using `parentNodeName` instead, which are guaranteed to be unique. In order to not disrupt | ||
// running workflows during upgrade time, we first check if there is an onExit node that currently exists with the | ||
// legacy name AND said node is a child of the parent node. If it does, we continue execution with the legacy name. | ||
// If it doesn't, we use the new (and unique) name for all operations henceforth. | ||
// | ||
// Here we test to see if this backwards compatibility works. This test workflow contains a running onExit node with the | ||
// old name. When we call operate on it, we should NOT create another onExit node with the new name and instead respect | ||
// the old onExit node. | ||
// | ||
// TODO: This test should be removed after a couple of "grace period" version upgrades to allow transitions. It was introduced in v3.0.0 | ||
// See more: https://github.com/argoproj/argo-workflows/issues/5502 | ||
func TestOnExitNameBackwardsCompatibility(t *testing.T) { | ||
wf := unmarshalWF(testOnExitNameBackwardsCompatibility) | ||
cancel, controller := newController(wf) | ||
defer cancel() | ||
|
||
ctx := context.Background() | ||
woc := newWorkflowOperationCtx(wf, controller) | ||
|
||
createRunningPods(ctx, woc) | ||
|
||
nodesBeforeOperation := len(woc.wf.Status.Nodes) | ||
woc.operate(ctx) | ||
|
||
assert.Equal(t, wfv1.WorkflowRunning, woc.wf.Status.Phase) | ||
// Number of nodes should not change (no new name node was created) | ||
assert.Equal(t, nodesBeforeOperation, len(woc.wf.Status.Nodes)) | ||
node := woc.wf.Status.Nodes.FindByDisplayName("run.onExit") | ||
if assert.NotNil(t, node) { | ||
assert.Equal(t, wfv1.NodeRunning, node.Phase) | ||
} | ||
} | ||
|
||
const testOnExitDAGStatusCompatibility = ` | ||
apiVersion: argoproj.io/v1alpha1 | ||
kind: Workflow | ||
metadata: | ||
name: dag-diamond-8xw8l | ||
spec: | ||
entrypoint: diamond | ||
templates: | ||
- dag: | ||
tasks: | ||
- name: A | ||
onExit: echo | ||
template: echo | ||
- depends: A | ||
name: B | ||
template: echo | ||
name: diamond | ||
- container: | ||
command: | ||
- echo | ||
- hi | ||
image: alpine:3.7 | ||
name: echo | ||
status: | ||
nodes: | ||
dag-diamond-8xw8l: | ||
children: | ||
- dag-diamond-8xw8l-1488416551 | ||
displayName: dag-diamond-8xw8l | ||
finishedAt: "2021-03-24T15:37:06Z" | ||
id: dag-diamond-8xw8l | ||
name: dag-diamond-8xw8l | ||
outboundNodes: | ||
- dag-diamond-8xw8l-1505194170 | ||
phase: Running | ||
startedAt: "2021-03-24T15:36:47Z" | ||
templateName: diamond | ||
templateScope: local/dag-diamond-8xw8l | ||
type: DAG | ||
dag-diamond-8xw8l-1342580575: | ||
boundaryID: dag-diamond-8xw8l | ||
displayName: A.onExit | ||
finishedAt: "2021-03-24T15:36:59Z" | ||
hostNodeName: k3d-k3s-default-server-0 | ||
id: dag-diamond-8xw8l-1342580575 | ||
name: A.onExit | ||
phase: Running | ||
startedAt: "2021-03-24T15:36:54Z" | ||
templateName: echo | ||
templateScope: local/dag-diamond-8xw8l | ||
type: Pod | ||
dag-diamond-8xw8l-1488416551: | ||
boundaryID: dag-diamond-8xw8l | ||
children: | ||
- dag-diamond-8xw8l-1342580575 | ||
displayName: A | ||
finishedAt: "2021-03-24T15:36:53Z" | ||
hostNodeName: k3d-k3s-default-server-0 | ||
id: dag-diamond-8xw8l-1488416551 | ||
name: dag-diamond-8xw8l.A | ||
phase: Succeeded | ||
startedAt: "2021-03-24T15:36:47Z" | ||
templateName: echo | ||
templateScope: local/dag-diamond-8xw8l | ||
type: Pod | ||
phase: Running | ||
startedAt: "2021-03-24T15:36:47Z" | ||
` | ||
|
||
// Previously we used `parentNodeDisplayName` to generate all onExit node names. However, as these can be non-unique | ||
// we transitioned to using `parentNodeName` instead, which are guaranteed to be unique. In order to not disrupt | ||
// running workflows during upgrade time, we first check if there is an onExit node that currently exists with the | ||
// legacy name AND said node is a child of the parent node. If it does, we continue execution with the legacy name. | ||
// If it doesn't, we use the new (and unique) name for all operations henceforth. | ||
// | ||
// Here we test to see if this backwards compatibility works. This test workflow contains a running onExit node with the | ||
// old name. When we call operate on it, we should NOT create the subsequent DAG done ("B") until the onExit node name with | ||
// the old name finishes running. | ||
// | ||
// TODO: This test should be removed after a couple of "grace period" version upgrades to allow transitions. It was introduced in v3.0.0 | ||
// See more: https://github.com/argoproj/argo-workflows/issues/5502 | ||
func TestOnExitDAGStatusCompatibility(t *testing.T) { | ||
wf := unmarshalWF(testOnExitDAGStatusCompatibility) | ||
cancel, controller := newController(wf) | ||
defer cancel() | ||
|
||
ctx := context.Background() | ||
woc := newWorkflowOperationCtx(wf, controller) | ||
|
||
createRunningPods(ctx, woc) | ||
|
||
nodesBeforeOperation := len(woc.wf.Status.Nodes) | ||
woc.operate(ctx) | ||
|
||
assert.Equal(t, wfv1.WorkflowRunning, woc.wf.Status.Phase) | ||
// Number of nodes should not change (no new name node was created) | ||
assert.Equal(t, nodesBeforeOperation, len(woc.wf.Status.Nodes)) | ||
node := woc.wf.Status.Nodes.FindByDisplayName("A.onExit") | ||
if assert.NotNil(t, node) { | ||
assert.Equal(t, wfv1.NodeRunning, node.Phase) | ||
} | ||
|
||
nodeB := woc.wf.Status.Nodes.FindByDisplayName("B") | ||
assert.Nil(t, nodeB) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unit test please
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Working on it right now, will push soon