Skip to content

Commit

Permalink
Check for other init container failures
Browse files Browse the repository at this point in the history
  • Loading branch information
DrJosh9000 committed Dec 3, 2024
1 parent 71b1c9b commit 7cdeeb7
Show file tree
Hide file tree
Showing 7 changed files with 209 additions and 50 deletions.
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ require (
github.com/go-playground/universal-translator v0.18.1
github.com/go-playground/validator/v10 v10.23.0
github.com/google/uuid v1.6.0
github.com/jedib0t/go-pretty/v6 v6.6.3
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51
github.com/spf13/cobra v1.8.1
github.com/spf13/viper v1.19.0
Expand Down Expand Up @@ -93,12 +94,14 @@ require (
github.com/lufia/plan9stats v0.0.0-20220913051719-115f729f3c8c // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/outcaste-io/ristretto v0.2.3 // indirect
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
github.com/power-devops/perfstat v0.0.0-20220216144756-c35f1ee13d7c // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/rivo/uniseg v0.4.4 // indirect
github.com/ryanuber/go-glob v1.0.0 // indirect
github.com/sagikazarmark/locafero v0.4.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
Expand Down
7 changes: 7 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,8 @@ github.com/imdario/mergo v0.3.12 h1:b6R2BslTbIEToALKP7LxUvijTsNI9TAe80pLWN2g/HU=
github.com/imdario/mergo v0.3.12/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/jedib0t/go-pretty/v6 v6.6.3 h1:nGqgS0tgIO1Hto47HSaaK4ac/I/Bu7usmdD3qvs0WvM=
github.com/jedib0t/go-pretty/v6 v6.6.3/go.mod h1:zbn98qrYlh95FIhwwsbIip0LYpwSG8SUOScs+v9/t0E=
github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
Expand Down Expand Up @@ -351,6 +353,8 @@ github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U=
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mattn/go-zglob v0.0.6 h1:mP8RnmCgho4oaUYDIDn6GNxYk+qJGUs8fJLn+twYj2A=
github.com/mattn/go-zglob v0.0.6/go.mod h1:MxxjyoXXnMxfIpxTK2GAkw1w8glPsQILx3N5wrKakiY=
github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc=
Expand Down Expand Up @@ -410,6 +414,9 @@ github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/richardartoul/molecule v1.0.1-0.20240531184615-7ca0df43c0b3 h1:4+LEVOB87y175cLJC/mbsgKmoDOjrBldtXvioEy96WY=
github.com/richardartoul/molecule v1.0.1-0.20240531184615-7ca0df43c0b3/go.mod h1:vl5+MqJ1nBINuSsUI2mGgH79UweUT/B5Fy8857PqyyI=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
Expand Down
185 changes: 136 additions & 49 deletions internal/controller/scheduler/pod_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (

"github.com/Khan/genqlient/graphql"
"github.com/google/uuid"
"github.com/jedib0t/go-pretty/v6/table"
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
Expand Down Expand Up @@ -62,10 +63,12 @@ type podWatcher struct {

// NewPodWatcher creates an informer that does various things with pods and
// Buildkite jobs:
// - If a container stays in ImagePullBackOff state for too long, the Buildkite
// - If an init container fails, the BK Agent REST API will be used to fail
// the job (since an agent hasn't run yet).
// - If a container stays in ImagePullBackOff state for too long, the BK
// Agent REST API will be used to fail the job and the pod will be evicted.
// - If a container stays in ImagePullBackOff, and the pod somehow got through
// all the init containers (including the image pull checks...) the Buildkite
// all the init containers (including the image pull checks...) the BK
// GraphQL API will be used to cancel the job instead.
// - If a pod is pending, every so often Buildkite will be checked to see if
// the corresponding job has been cancelled so that the pod can be evicted
Expand Down Expand Up @@ -129,6 +132,9 @@ func (w *podWatcher) OnDelete(maybePod any) {
}

w.stopJobCancelChecker(jobUUID)

// The pod is gone, so we can stop ignoring it (if it comes back).
w.unignoreJob(jobUUID)
}

func (w *podWatcher) OnAdd(maybePod any, isInInitialList bool) {
Expand Down Expand Up @@ -165,9 +171,13 @@ func (w *podWatcher) runChecks(ctx context.Context, pod *corev1.Pod) {
return
}

// Check for a container stuck in ImagePullBackOff, and fail or cancel
// the job accordingly.
w.cancelImagePullBackOff(ctx, log, pod, jobUUID)
// Check for an init container that failed for any reason.
// (Note: users can define their own init containers through podSpec.)
w.failOnInitContainerFailure(ctx, log, pod, jobUUID)

// Check for a container stuck in ImagePullBackOff or InvalidImageName,
// and fail or cancel the job accordingly.
w.failOnImagePullFailure(ctx, log, pod, jobUUID)

// Check whether the agent container has started yet, and start or stop the
// job cancel checker accordingly.
Expand Down Expand Up @@ -208,49 +218,67 @@ func (w *podWatcher) jobUUIDAndLogger(pod *corev1.Pod) (uuid.UUID, *zap.Logger,
return jobUUID, log, nil
}

func (w *podWatcher) cancelImagePullBackOff(ctx context.Context, log *zap.Logger, pod *corev1.Pod, jobUUID uuid.UUID) {
log.Debug("Checking pod for ImagePullBackOff")
func (w *podWatcher) failOnImagePullFailure(ctx context.Context, log *zap.Logger, pod *corev1.Pod, jobUUID uuid.UUID) {
log.Debug("Checking pod containers for ImagePullBackOff or InvalidImageName")

if pod.Status.StartTime == nil {
// Status could be unpopulated, or it hasn't started yet.
return
}
startedAt := pod.Status.StartTime.Time
if startedAt.IsZero() || time.Since(startedAt) < w.imagePullBackOffGracePeriod {
// Not started yet, or started recently
return
}
failImmediately := false

images := make(map[string]struct{})

// If any init container fails to pull, whether it's one we added
// specifically to check for pull failure, the pod won't run.
for _, containerStatus := range pod.Status.InitContainerStatuses {
if !shouldCancel(&containerStatus) {
waiting := containerStatus.State.Waiting
if waiting == nil {
continue
}
images[containerStatus.Image] = struct{}{}
switch waiting.Reason {
case "ImagePullBackOff":
images[containerStatus.Image] = struct{}{}
case "InvalidImageName":
images[containerStatus.Image] = struct{}{}
failImmediately = true
}
}

// These containers only run after the init containers have run.
// Theoretically this could still happen even if all the init containers
// successfully pulled.
for _, containerStatus := range pod.Status.ContainerStatuses {
if !shouldCancel(&containerStatus) {
waiting := containerStatus.State.Waiting
if waiting == nil {
continue
}
if !isSystemContainer(&containerStatus) {
log.Info("Ignoring container during ImagePullBackOff watch.", zap.String("name", containerStatus.Name))
continue
switch waiting.Reason {
case "ImagePullBackOff":
if !isSystemContainer(&containerStatus) {
log.Info("Ignoring container during ImagePullBackOff watch.", zap.String("name", containerStatus.Name))
continue
}
images[containerStatus.Image] = struct{}{}
case "InvalidImageName":
images[containerStatus.Image] = struct{}{}
failImmediately = true
}
images[containerStatus.Image] = struct{}{}
}

if len(images) == 0 {
// All's well with the world.
return
}

if !failImmediately { // apply the grace period
if pod.Status.StartTime == nil {
// Status could be unpopulated, or it hasn't started yet.
return
}
startedAt := pod.Status.StartTime.Time
if startedAt.IsZero() || time.Since(startedAt) < w.imagePullBackOffGracePeriod {
// Not started yet, or started recently
return
}
}

// Get the current job state from BK.
// What we do next depends on what state it is in.
resp, err := api.GetCommandJob(ctx, w.gql, jobUUID.String())
Expand All @@ -270,7 +298,16 @@ func (w *podWatcher) cancelImagePullBackOff(ctx context.Context, log *zap.Logger
case api.JobStatesScheduled:
// We can acquire it and fail it ourselves.
log.Info("One or more job containers are in ImagePullBackOff. Failing.")
w.failJob(ctx, log, pod, jobUUID, images)
message := w.formatImagePullFailureMessage(images)
if err := w.failJob(ctx, log, pod, jobUUID, message); errors.Is(err, agentcore.ErrJobAcquisitionRejected) {
// If the error was because BK rejected the acquisition(?), then its probably moved
// on to a state where we need to cancel instead.
log.Info("Attempting to cancel job instead")
w.cancelJob(ctx, log, pod, jobUUID)
return
}
// Also evict the pod, because it won't die on its own.
w.evictPod(ctx, log, pod, jobUUID)

case api.JobStatesAccepted, api.JobStatesAssigned, api.JobStatesRunning:
// An agent is already doing something with the job - now canceling
Expand All @@ -293,14 +330,61 @@ func (w *podWatcher) cancelImagePullBackOff(ctx context.Context, log *zap.Logger
}
}

func (w *podWatcher) failJob(ctx context.Context, log *zap.Logger, pod *corev1.Pod, jobUUID uuid.UUID, images map[string]struct{}) {
agentToken, err := fetchAgentToken(ctx, w.logger, w.k8s, w.cfg.Namespace, w.cfg.AgentTokenSecret)
if err != nil {
log.Error("Couldn't fetch agent token in order to fail the job", zap.Error(err))
// failOnInitContainerFailure checks the pod for init containers that have
// terminated with a nonzero exit code, and if there are any, acquires and
// fails the corresponding Buildkite job (via the Agent REST API, since no
// agent has run for this job yet). The failure message includes a table of
// the failed containers' exit statuses.
func (w *podWatcher) failOnInitContainerFailure(ctx context.Context, log *zap.Logger, pod *corev1.Pod, jobUUID uuid.UUID) {
	log.Debug("Checking pod for failed init containers")

	// Failed init containers keyed by container name.
	containerFails := make(map[string]*corev1.ContainerStateTerminated)

	// If any init container fails, whether it's one we added specifically to
	// check for pull failure or not, the pod won't run.
	for _, containerStatus := range pod.Status.InitContainerStatuses {
		term := containerStatus.State.Terminated
		if term == nil || term.ExitCode == 0 { // not terminated, or succeeded
			continue
		}
		containerFails[containerStatus.Name] = term
	}

	if len(containerFails) == 0 {
		// All's well with the world.
		return
	}

	// One or more init containers failed; fail the job with a formatted report.
	// Attempt to acquire it and fail it ourselves.
	// Don't bother checking the current BK state of the job in advance, since
	// it should always be api.JobStatesScheduled. Init containers must all
	// succeed before the agent container starts.
	// If it's not in Scheduled state, acquire will fail, but also that would
	// imply something weird is going on with the job (another agent?) and we
	// probably shouldn't interfere.
	log.Info("One or more init containers failed. Failing.")
	message := w.formatInitContainerFails(containerFails)
	// failJob logs on error - that's sufficient error handling.
	_ = w.failJob(ctx, log, pod, jobUUID, message)
	// No need to fall back to cancelling if acquire failed - see above.
	// No need to evict, the pod should be considered failed already.
}

// formatInitContainerFails renders the failed init container terminations as
// a human-readable message: a short explanation followed by a table with one
// row per container (sorted by container name) showing exit code, signal,
// reason, and message.
func (w *podWatcher) formatInitContainerFails(terms map[string]*corev1.ContainerStateTerminated) string {
	names := make([]string, 0, len(terms))
	for name := range terms {
		names = append(names, name)
	}
	slices.Sort(names)

	tbl := table.NewWriter()
	tbl.SetStyle(table.StyleColoredDark)
	tbl.AppendHeader(table.Row{"CONTAINER", "EXIT CODE", "SIGNAL", "REASON", "MESSAGE"})
	tbl.AppendSeparator()
	for _, name := range names {
		t := terms[name]
		tbl.AppendRow(table.Row{name, t.ExitCode, t.Signal, t.Reason, t.Message})
	}
	return "The following init containers failed:\n\n" + tbl.Render()
}

func (w *podWatcher) formatImagePullFailureMessage(images map[string]struct{}) string {
// Format the failed images into a nice sorted list.
imagesList := make([]string, 0, len(images))
for image := range images {
imagesList = append(imagesList, image)
Expand All @@ -309,27 +393,32 @@ func (w *podWatcher) failJob(ctx context.Context, log *zap.Logger, pod *corev1.P
var message strings.Builder
message.WriteString("The following container images couldn't be pulled:\n")
for _, image := range imagesList {
fmt.Fprintf(&message, " * %s\n", image)
fmt.Fprintf(&message, " * %q\n", image)
}
return message.String()
}

func (w *podWatcher) failJob(ctx context.Context, log *zap.Logger, pod *corev1.Pod, jobUUID uuid.UUID, message string) error {
agentToken, err := fetchAgentToken(ctx, w.logger, w.k8s, w.cfg.Namespace, w.cfg.AgentTokenSecret)
if err != nil {
log.Error("Couldn't fetch agent token in order to fail the job", zap.Error(err))
return err
}

// Tags are required order to connect the agent.
tags := agenttags.TagsFromLabels(pod.Labels)
opts := w.cfg.AgentConfig.ControllerOptions()

if err := failJob(ctx, w.logger, agentToken, jobUUID.String(), tags, message.String(), opts...); err != nil {
if err := failJob(ctx, w.logger, agentToken, jobUUID.String(), tags, message, opts...); err != nil {
log.Error("Couldn't fail the job", zap.Error(err))
podWatcherBuildkiteJobFailErrorsCounter.Inc()
// If the error was because BK rejected the acquisition, then its moved
// on to a state where we need to cancel instead.
if errors.Is(err, agentcore.ErrJobAcquisitionRejected) {
log.Info("Attempting to cancel job instead")
w.cancelJob(ctx, log, pod, jobUUID)
}
return
return err
}
podWatcherBuildkiteJobFailsCounter.Inc()
return nil
}

// Let's also evict the pod (request graceful termination).
func (w *podWatcher) evictPod(ctx context.Context, log *zap.Logger, pod *corev1.Pod, jobUUID uuid.UUID) {
eviction := &policyv1.Eviction{
ObjectMeta: pod.ObjectMeta,
}
Expand Down Expand Up @@ -361,11 +450,8 @@ func (w *podWatcher) cancelJob(ctx context.Context, log *zap.Logger, pod *corev1
}
podWatcherBuildkiteJobCancelsCounter.Inc()

// Evicting the pod might prevent the agent from logging its last-gasp
// "it could be ImagePullBackOff" message.
// On the other hand, not evicting the pod will probably leave it running
// indefinitely if there are any sidecars.
// TODO: experiment with adding eviction here.
// Note that evicting the pod might prevent the agent from logging its
// last-gasp "it could be ImagePullBackOff" message.

// We can avoid repeating the GraphQL queries to fetch and cancel the job
// (between cancelling and Kubernetes cleaning up the pod) if we got here.
Expand Down Expand Up @@ -470,6 +556,12 @@ func (w *podWatcher) ignoreJob(jobUUID uuid.UUID) {
w.ignoreJobs[jobUUID] = struct{}{}
}

// unignoreJob removes the job UUID from the set of ignored jobs, so that a
// pod for this job will be processed again if it reappears later (the
// counterpart to ignoreJob; called from OnDelete once the pod is gone).
func (w *podWatcher) unignoreJob(jobUUID uuid.UUID) {
	w.ignoreJobsMu.Lock()
	defer w.ignoreJobsMu.Unlock()
	delete(w.ignoreJobs, jobUUID)
}

// onceChan stores a channel and a [sync.Once] to be used for closing the
// channel at most once.
type onceChan struct {
Expand All @@ -484,11 +576,6 @@ func (oc *onceChan) closeOnce() {
oc.once.Do(func() { close(oc.ch) })
}

func shouldCancel(containerStatus *corev1.ContainerStatus) bool {
return containerStatus.State.Waiting != nil &&
containerStatus.State.Waiting.Reason == "ImagePullBackOff"
}

// All container-\d containers will have the agent installed as their PID 1.
// Therefore, their lifecycle is well monitored in our backend, allowing us to terminate them if they fail to start.
//
Expand Down
17 changes: 17 additions & 0 deletions internal/integration/fixtures/broken-init-container.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Integration test fixture: a pipeline whose podSpec defines an init container
# with a command that cannot be executed, deliberately causing the init
# container to fail so the controller's failed-init-container handling can be
# exercised.
steps:
- label: ":x:"
  agents:
    queue: "{{.queue}}"
  plugins:
  - kubernetes:
      podSpec:
        initContainers:
        - name: broken
          image: buildkite/agent:latest
          command:
          - "well this isn't going to work"
        containers: # one command needed to make a valid podspec
        - name: load-bearing
          image: buildkite/agent:latest
          command:
          - "echo romeo romeo oscar romeo"
3 changes: 3 additions & 0 deletions internal/integration/fixtures/cancel-checker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ steps:
- name: snorlax
image: buildkite/agent:latest
command:
- "/bin/sh"
args:
- "-c"
- "sleep 20"
containers: # one command needed to make a valid podspec
- name: load-bearing
Expand Down
11 changes: 11 additions & 0 deletions internal/integration/fixtures/invalid-image-name.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Integration test fixture: a pipeline whose container image reference is
# deliberately malformed (extra words after the image name), so the container
# enters the InvalidImageName state and the controller's image-pull-failure
# handling can be exercised.
steps:
- label: ":x:"
  agents:
    queue: "{{.queue}}"
  plugins:
  - kubernetes:
      podSpec:
        containers:
        - image: buildkite/agent:latest plus some extra junk
          command:
          - "true"
Loading

0 comments on commit 7cdeeb7

Please sign in to comment.