From 5c85fd36625fb7bbf7d85513a663c181bf8dc5c5 Mon Sep 17 00:00:00 2001 From: Dillen Padhiar <38965141+dpadhiar@users.noreply.github.com> Date: Tue, 18 Jun 2024 15:06:56 -0700 Subject: [PATCH] fix: process metrics later in `executeTemplate`. Fixes #13162 (#13163) Signed-off-by: Dillen Padhiar (cherry picked from commit 6201d759ebb32ba543c01665c142d04b0428f227) --- test/e2e/metrics_test.go | 17 +++++++ ...late-status-failed-conditional-metric.yaml | 31 +++++++++++++ workflow/controller/operator.go | 46 +++++++++---------- 3 files changed, 71 insertions(+), 23 deletions(-) create mode 100644 test/e2e/testdata/template-status-failed-conditional-metric.yaml diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index febdf06f38e6..e8140c44fac6 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -94,6 +94,23 @@ func (s *MetricsSuite) TestDAGMetrics() { }) } +func (s *MetricsSuite) TestFailedMetric() { + s.Given(). + Workflow(`@testdata/template-status-failed-conditional-metric.yaml`). + When(). + SubmitWorkflow(). + WaitForWorkflow(fixtures.ToBeFailed). + Then(). + ExpectWorkflow(func(t *testing.T, metadata *metav1.ObjectMeta, status *wfv1.WorkflowStatus) { + assert.Equal(t, wfv1.WorkflowFailed, status.Phase) + s.e(s.T()).GET(""). + Expect(). + Status(200). + Body(). + Contains(`argo_workflows_task_failure 1`) + }) +} + func TestMetricsSuite(t *testing.T) { suite.Run(t, new(MetricsSuite)) } diff --git a/test/e2e/testdata/template-status-failed-conditional-metric.yaml b/test/e2e/testdata/template-status-failed-conditional-metric.yaml new file mode 100644 index 000000000000..7fd9a166b1e6 --- /dev/null +++ b/test/e2e/testdata/template-status-failed-conditional-metric.yaml @@ -0,0 +1,31 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + generateName: template-status-failed-conditional-metric- +spec: + entrypoint: dag + templates: + - name: dag + metrics: + prometheus: + - counter: + value: "1" + help: Task failed + name: task_failure + when: '{{status}} == Failed' + dag: + tasks: + - name: test + template: echo + arguments: + parameters: + - name: message + value: "test" + + - name: echo + inputs: + parameters: + - name: message + container: + image: alpine:3.7 + command: ["{{inputs.parameters.message}}"] \ No newline at end of file diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index 6798c7a9dffa..564da9938947 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -2214,29 +2214,6 @@ func (woc *wfOperationCtx) executeTemplate(ctx context.Context, nodeName string, woc.controller.syncManager.Release(woc.wf, node.ID, processedTmpl.Synchronization) } - if processedTmpl.Metrics != nil { - // Check if the node was just created, if it was emit realtime metrics. - // If the node did not previously exist, we can infer that it was created during the current operation, emit real time metrics. - if _, ok := woc.preExecutionNodePhases[node.ID]; !ok { - localScope, realTimeScope := woc.prepareMetricScope(node) - woc.computeMetrics(processedTmpl.Metrics.Prometheus, localScope, realTimeScope, true) - } - // Check if the node completed during this execution, if it did emit metrics - // - // This check is necessary because sometimes a node will be marked completed during the current execution and will - // not be considered again. The best example of this is the entrypoint steps/dag template (once completed, the - // workflow ends and it's not reconsidered). This checks makes sure that its metrics also get emitted. - // - // In this check, a completed node may or may not have existed prior to this execution. If it did exist, ensure that it wasn't - // completed before this execution. If it did not exist prior, then we can infer that it was completed during this execution. - // The statement "(!ok || !prevNodeStatus.Fulfilled())" checks for this behavior and represents the material conditional - // "ok -> !prevNodeStatus.Fulfilled()" (https://en.wikipedia.org/wiki/Material_conditional) - if prevNodeStatus, ok := woc.preExecutionNodePhases[node.ID]; (!ok || !prevNodeStatus.Fulfilled()) && node.Fulfilled() { - localScope, realTimeScope := woc.prepareMetricScope(node) - woc.computeMetrics(processedTmpl.Metrics.Prometheus, localScope, realTimeScope, false) - } - } - retrieveNode, err := woc.wf.GetNodeByName(node.Name) if err != nil { err := fmt.Errorf("no Node found by the name of %s; wf.Status.Nodes=%+v", node.Name, woc.wf.Status.Nodes) @@ -2266,6 +2243,29 @@ func (woc *wfOperationCtx) executeTemplate(ctx context.Context, nodeName string, node = retryNode } + if processedTmpl.Metrics != nil { + // Check if the node was just created, if it was emit realtime metrics. + // If the node did not previously exist, we can infer that it was created during the current operation, emit real time metrics. + if _, ok := woc.preExecutionNodePhases[node.ID]; !ok { + localScope, realTimeScope := woc.prepareMetricScope(node) + woc.computeMetrics(processedTmpl.Metrics.Prometheus, localScope, realTimeScope, true) + } + // Check if the node completed during this execution, if it did emit metrics + // + // This check is necessary because sometimes a node will be marked completed during the current execution and will + // not be considered again. The best example of this is the entrypoint steps/dag template (once completed, the + // workflow ends and it's not reconsidered). This checks makes sure that its metrics also get emitted. + // + // In this check, a completed node may or may not have existed prior to this execution. If it did exist, ensure that it wasn't + // completed before this execution. If it did not exist prior, then we can infer that it was completed during this execution. + // The statement "(!ok || !prevNodeStatus.Fulfilled())" checks for this behavior and represents the material conditional + // "ok -> !prevNodeStatus.Fulfilled()" (https://en.wikipedia.org/wiki/Material_conditional) + if prevNodeStatus, ok := woc.preExecutionNodePhases[node.ID]; (!ok || !prevNodeStatus.Fulfilled()) && node.Fulfilled() { + localScope, realTimeScope := woc.prepareMetricScope(node) + woc.computeMetrics(processedTmpl.Metrics.Prometheus, localScope, realTimeScope, false) + } + } + return node, nil }