Force restartPolicy "Never" to prevent runner pods from stucking in T…

…erminating when the container disappeared Ref #1369
actions · May 12, 2022 · 988389a · 988389a
1 parent 4053ab3
commit 988389a
Show file tree

Hide file tree

Showing 4 changed files with 30 additions and 12 deletions.
diff --git a/controllers/new_runner_pod_test.go b/controllers/new_runner_pod_test.go
@@ -159,7 +159,7 @@ func TestNewRunnerPod(t *testing.T) {
 					},
 				},
 			},
-			RestartPolicy: corev1.RestartPolicyOnFailure,
+			RestartPolicy: corev1.RestartPolicyNever,
 		},
 	}
 
@@ -245,7 +245,7 @@ func TestNewRunnerPod(t *testing.T) {
 					},
 				},
 			},
-			RestartPolicy: corev1.RestartPolicyOnFailure,
+			RestartPolicy: corev1.RestartPolicyNever,
 		},
 	}
 
@@ -327,7 +327,7 @@ func TestNewRunnerPod(t *testing.T) {
 					},
 				},
 			},
-			RestartPolicy: corev1.RestartPolicyOnFailure,
+			RestartPolicy: corev1.RestartPolicyNever,
 		},
 	}
 
@@ -593,7 +593,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) {
 					},
 				},
 			},
-			RestartPolicy: corev1.RestartPolicyOnFailure,
+			RestartPolicy: corev1.RestartPolicyNever,
 		},
 	}
 
@@ -694,7 +694,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) {
 					},
 				},
 			},
-			RestartPolicy: corev1.RestartPolicyOnFailure,
+			RestartPolicy: corev1.RestartPolicyNever,
 		},
 	}
 
@@ -795,7 +795,7 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) {
 					},
 				},
 			},
-			RestartPolicy: corev1.RestartPolicyOnFailure,
+			RestartPolicy: corev1.RestartPolicyNever,
 		},
 	}
 

diff --git a/controllers/pod_runner_token_injector.go b/controllers/pod_runner_token_injector.go
@@ -78,9 +78,7 @@ func (t *PodRunnerTokenInjector) Handle(ctx context.Context, req admission.Reque
 
 	updated.Annotations[AnnotationKeyTokenExpirationDate] = ts
 
-	if pod.Spec.RestartPolicy != corev1.RestartPolicyOnFailure {
-		updated.Spec.RestartPolicy = corev1.RestartPolicyOnFailure
-	}
+	forceRunnerPodRestartPolicyNever(updated)
 
 	buf, err := json.Marshal(updated)
 	if err != nil {

diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go
@@ -629,9 +629,7 @@ func newRunnerPod(runnerName string, template corev1.Pod, runnerSpec v1alpha1.Ru
 
 	pod := template.DeepCopy()
 
-	if pod.Spec.RestartPolicy == "" {
-		pod.Spec.RestartPolicy = "OnFailure"
-	}
+	forceRunnerPodRestartPolicyNever(pod)
 
 	if mtu := runnerSpec.DockerMTU; mtu != nil && dockerdInRunner {
 		runnerContainer.Env = append(runnerContainer.Env, []corev1.EnvVar{

diff --git a/controllers/runner_pod.go b/controllers/runner_pod.go
@@ -0,0 +1,22 @@
+package controllers
+
+import corev1 "k8s.io/api/core/v1"
+
+// Force the runner pod managed by either RunnerDeployment and RunnerSet to have restartPolicy=Never.
+// See https://github.com/actions-runner-controller/actions-runner-controller/issues/1369 for more context.
+//
+// This is to prevent runner pods from stucking in Terminating when a K8s node disappeared along with the runnr pod and the runner container within it.
+//
+// Previously, we used restartPolicy of OnFailure, it turned wrong later, and therefore we now set Never.
+//
+// When the restartPolicy is OnFailure and the node disappeared, runner pods on the node seem to stuck in state.terminated==nil, state.waiting!=nil, and state.lastTerminationState!=nil,
+// and will ever become Running.
+// It's probably due to that the node onto which the pods have been scheduled will ever come back, hence the container restart attempt swill ever succeed,
+// the pods stuck waiting for successful restarts forever.
+//
+// By forcing runner pods to never restart, we hope there will be no chances of pods being stuck waiting.
+func forceRunnerPodRestartPolicyNever(pod *corev1.Pod) {
+	if pod.Spec.RestartPolicy != corev1.RestartPolicyNever {
+		pod.Spec.RestartPolicy = corev1.RestartPolicyNever
+	}
+}