Skip to content

Commit

Permalink
fix: force restartPolicy "Never" to prevent runner pods from stucking…
Browse files Browse the repository at this point in the history
… in Terminating when the container disappeared (#1395)

Ref #1369
  • Loading branch information
mumoshu authored May 14, 2022
1 parent 3014e98 commit 759349d
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 12 deletions.
12 changes: 6 additions & 6 deletions controllers/new_runner_pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ func TestNewRunnerPod(t *testing.T) {
},
},
},
RestartPolicy: corev1.RestartPolicyOnFailure,
RestartPolicy: corev1.RestartPolicyNever,
},
}

Expand Down Expand Up @@ -237,7 +237,7 @@ func TestNewRunnerPod(t *testing.T) {
},
},
},
RestartPolicy: corev1.RestartPolicyOnFailure,
RestartPolicy: corev1.RestartPolicyNever,
},
}

Expand Down Expand Up @@ -315,7 +315,7 @@ func TestNewRunnerPod(t *testing.T) {
},
},
},
RestartPolicy: corev1.RestartPolicyOnFailure,
RestartPolicy: corev1.RestartPolicyNever,
},
}

Expand Down Expand Up @@ -576,7 +576,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) {
},
},
},
RestartPolicy: corev1.RestartPolicyOnFailure,
RestartPolicy: corev1.RestartPolicyNever,
},
}

Expand Down Expand Up @@ -673,7 +673,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) {
},
},
},
RestartPolicy: corev1.RestartPolicyOnFailure,
RestartPolicy: corev1.RestartPolicyNever,
},
}

Expand Down Expand Up @@ -770,7 +770,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) {
},
},
},
RestartPolicy: corev1.RestartPolicyOnFailure,
RestartPolicy: corev1.RestartPolicyNever,
},
}

Expand Down
4 changes: 1 addition & 3 deletions controllers/pod_runner_token_injector.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,7 @@ func (t *PodRunnerTokenInjector) Handle(ctx context.Context, req admission.Reque

updated.Annotations[AnnotationKeyTokenExpirationDate] = ts

if pod.Spec.RestartPolicy != corev1.RestartPolicyOnFailure {
updated.Spec.RestartPolicy = corev1.RestartPolicyOnFailure
}
forceRunnerPodRestartPolicyNever(updated)

buf, err := json.Marshal(updated)
if err != nil {
Expand Down
4 changes: 1 addition & 3 deletions controllers/runner_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -676,9 +676,7 @@ func newRunnerPod(runnerName string, template corev1.Pod, runnerSpec v1alpha1.Ru

pod := template.DeepCopy()

if pod.Spec.RestartPolicy == "" {
pod.Spec.RestartPolicy = "OnFailure"
}
forceRunnerPodRestartPolicyNever(pod)

if mtu := runnerSpec.DockerMTU; mtu != nil && dockerdInRunner {
runnerContainer.Env = append(runnerContainer.Env, []corev1.EnvVar{
Expand Down
22 changes: 22 additions & 0 deletions controllers/runner_pod.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package controllers

import corev1 "k8s.io/api/core/v1"

// Force the runner pod managed by either RunnerDeployment and RunnerSet to have restartPolicy=Never.
// See https://github.com/actions-runner-controller/actions-runner-controller/issues/1369 for more context.
//
// This is to prevent runner pods from stucking in Terminating when a K8s node disappeared along with the runnr pod and the runner container within it.
//
// Previously, we used restartPolicy of OnFailure, it turned wrong later, and therefore we now set Never.
//
// When the restartPolicy is OnFailure and the node disappeared, runner pods on the node seem to stuck in state.terminated==nil, state.waiting!=nil, and state.lastTerminationState!=nil,
// and will ever become Running.
// It's probably due to that the node onto which the pods have been scheduled will ever come back, hence the container restart attempt swill ever succeed,
// the pods stuck waiting for successful restarts forever.
//
// By forcing runner pods to never restart, we hope there will be no chances of pods being stuck waiting.
func forceRunnerPodRestartPolicyNever(pod *corev1.Pod) {
if pod.Spec.RestartPolicy != corev1.RestartPolicyNever {
pod.Spec.RestartPolicy = corev1.RestartPolicyNever
}
}

0 comments on commit 759349d

Please sign in to comment.