Skip to content

Commit

Permalink
fix(operator): Workflow stuck at running when init container failed. Fixes #10045 (#10047)
Browse files Browse the repository at this point in the history

Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
Signed-off-by: Saravanan Balasubramanian <sarabala1979@gmail.com>
  • Loading branch information
terrytangyuan authored and sarabala1979 committed Nov 29, 2022
1 parent fd31eb8 commit b19870d
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 0 deletions.
17 changes: 17 additions & 0 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1277,6 +1277,23 @@ func (woc *wfOperationCtx) assessNodeStatus(pod *apiv1.Pod, old *wfv1.NodeStatus
new.Outputs.ExitCode = pointer.StringPtr(fmt.Sprint(*exitCode))
}

// We cannot fail the node until the wait container is finished because it may be busy saving outputs, and these
// would not get captured successfully.
for _, c := range pod.Status.ContainerStatuses {
if c.Name == common.WaitContainerName && c.State.Terminated == nil && new.Phase.Completed() {
woc.log.WithField("new.phase", new.Phase).Info("leaving phase un-changed: wait container is not yet terminated ")
new.Phase = old.Phase
}
}
// If the init container failed, we should mark the node as failed.
for _, c := range pod.Status.InitContainerStatuses {
if c.State.Terminated != nil && int(c.State.Terminated.ExitCode) != 0 {
new.Phase = wfv1.NodeFailed
woc.log.WithField("new.phase", new.Phase).Info("marking node as failed since init container has non-zero exit code")
break
}
}

// if we are transitioning from Pending to a different state, clear out unchanged message
if old.Phase == wfv1.NodePending && new.Phase != wfv1.NodePending && old.Message == new.Message {
new.Message = ""
Expand Down
26 changes: 26 additions & 0 deletions workflow/controller/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1332,6 +1332,32 @@ func TestAssessNodeStatus(t *testing.T) {
},
node: &wfv1.NodeStatus{TemplateName: templateName},
want: wfv1.NodeFailed,
}, {
name: "pod failed - init container failed",
pod: &apiv1.Pod{
Status: apiv1.PodStatus{
InitContainerStatuses: []apiv1.ContainerStatus{
{
Name: common.InitContainerName,
State: apiv1.ContainerState{Terminated: &apiv1.ContainerStateTerminated{ExitCode: 1}},
},
},
ContainerStatuses: []apiv1.ContainerStatus{
{
Name: common.WaitContainerName,
State: apiv1.ContainerState{Terminated: nil},
},
{
Name: common.MainContainerName,
State: apiv1.ContainerState{Terminated: &apiv1.ContainerStateTerminated{ExitCode: 0}},
},
},
Message: "failed since init container failed",
Phase: apiv1.PodFailed,
},
},
node: &wfv1.NodeStatus{TemplateName: templateName},
want: wfv1.NodeFailed,
}, {
name: "pod running",
pod: &apiv1.Pod{
Expand Down

0 comments on commit b19870d

Please sign in to comment.