Skip to content

Commit

Permalink
feat(controller): Pod deletion grace period. Fixes #4719 (#4725)
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Collins <alex_collins@intuit.com>
  • Loading branch information
alexec committed Dec 18, 2020
1 parent 9a7e044 commit 11bc9c4
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 10 deletions.
21 changes: 11 additions & 10 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -903,21 +903,18 @@ func (woc *wfOperationCtx) podReconciliation() error {
}
if _, ok := seenPods[nodeID]; !ok {

// grace-period to allow informer sync
recentlyStarted := recentlyStarted(node)
woc.log.WithFields(log.Fields{"nodeName": node.Name, "nodePhase": node.Phase, "recentlyStarted": recentlyStarted}).Info("Workflow pod is missing")
metrics.PodMissingMetric.WithLabelValues(strconv.FormatBool(recentlyStarted), string(node.Phase)).Inc()

// If the node is pending and the pod does not exist, it could be the case that we want to try to submit it
// again instead of marking it as an error. Check if that's the case.
if node.Pending() {
if node.Pending() || recentlyStarted {
continue
}

node.Message = "pod deleted"
node.Phase = wfv1.NodeError
// FinishedAt must be set since retry strategy depends on it to determine the backoff duration.
// See processNodeRetries for more details.
node.FinishedAt = metav1.Time{Time: time.Now().UTC()}
woc.wf.Status.Nodes[nodeID] = node
woc.log.WithField("displayName", node.DisplayName).WithField("templateName", node.TemplateName).
WithField("node", node.Name).Error("Pod for node deleted")
woc.updated = true
woc.markNodePhase(node.Name, wfv1.NodeError, "pod deleted")
} else {
// At this point we are certain that the pod associated with our node is running or has been run;
// it is safe to extract the k8s-node information given this knowledge.
Expand All @@ -931,6 +928,10 @@ func (woc *wfOperationCtx) podReconciliation() error {
return nil
}

func recentlyStarted(node wfv1.NodeStatus) bool {
return time.Since(node.StartedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_STARTED_POD_DURATION", 10*time.Second)
}

// shouldPrintPodSpec return eligible to print to the pod spec
func (woc *wfOperationCtx) shouldPrintPodSpec(node wfv1.NodeStatus) bool {
return woc.controller.Config.PodSpecLogStrategy.AllPods ||
Expand Down
14 changes: 14 additions & 0 deletions workflow/metrics/pod_missing_metric.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package metrics

import "github.com/prometheus/client_golang/prometheus"

var (
PodMissingMetric = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: argoNamespace,
Name: "pod_missing",
Help: "Incidents of pod missing",
},
[]string{"recently_started", "node_phase"},
)
)
2 changes: 2 additions & 0 deletions workflow/metrics/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,15 @@ func (m *Metrics) Describe(ch chan<- *prometheus.Desc) {
ch <- metric.Desc()
}
m.logMetric.Describe(ch)
PodMissingMetric.Describe(ch)
}

func (m *Metrics) Collect(ch chan<- prometheus.Metric) {
for _, metric := range m.allMetrics() {
ch <- metric
}
m.logMetric.Collect(ch)
PodMissingMetric.Collect(ch)
}

func (m *Metrics) garbageCollector(ctx context.Context) {
Expand Down

0 comments on commit 11bc9c4

Please sign in to comment.