Skip to content

Commit

Permalink
fix: metrics don't get emitted properly during retry. Fixes #8207 #10463
Browse files Browse the repository at this point in the history
 (#10489)

Signed-off-by: Jiacheng Xu <xjcmaxwellcjx@gmail.com>
Co-authored-by: Saravanan Balasubramanian <33908564+sarabala1979@users.noreply.github.com>
  • Loading branch information
jiachengxu and sarabala1979 authored Mar 27, 2023
1 parent dd2f8cb commit cbd40e7
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 6 deletions.
20 changes: 20 additions & 0 deletions test/e2e/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@ import (
"testing"

"github.com/gavv/httpexpect/v2"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
"github.com/argoproj/argo-workflows/v3/test/e2e/fixtures"
)

Expand Down Expand Up @@ -57,6 +60,23 @@ func (s *MetricsSuite) TestMetricsEndpoint() {
})
}

func (s *MetricsSuite) TestRetryMetrics() {
s.Given().
Workflow(`@testdata/workflow-retry-metrics.yaml`).
When().
SubmitWorkflow().
WaitForWorkflow(fixtures.ToBeSucceeded).
Then().
ExpectWorkflow(func(t *testing.T, metadata *metav1.ObjectMeta, status *wfv1.WorkflowStatus) {
assert.Equal(t, wfv1.WorkflowSucceeded, status.Phase)
s.e(s.T()).GET("").
Expect().
Status(200).
Body().
Contains(`runs_exit_status_counter{exit_code="1",status="Failed"} 3`)
})
}

func TestMetricsSuite(t *testing.T) {
suite.Run(t, new(MetricsSuite))
}
38 changes: 38 additions & 0 deletions test/e2e/testdata/workflow-retry-metrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
generateName: workflow-retry-metrics
spec:
entrypoint: main
podGC:
strategy: OnWorkflowSuccess
templates:
- name: main
steps:
- - continueOn:
error: true
failed: true
name: runTest
template: run-test
- container:
name: runner
image: 'argoproj/argosay:v2'
args:
- exit 1
command:
- sh
- -c
metrics:
prometheus:
- counter:
value: "1"
help: Count of runs by exit code
labels:
- key: exit_code
value: '{{exitCode}}'
- key: status
value: "{{status}}"
name: runs_exit_status_counter
name: run-test
retryStrategy:
limit: "2"
17 changes: 11 additions & 6 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1922,8 +1922,15 @@ func (woc *wfOperationCtx) executeTemplate(ctx context.Context, nodeName string,
return retryParentNode, nil
}
retryParentNode = processedRetryParentNode
lastChildNode := getChildNodeIndex(retryParentNode, woc.wf.Status.Nodes, -1)
// The retry node might have completed by now.
if retryParentNode.Fulfilled() {
// If retry node has completed, set the output of the last child node to its output.
// Runtime parameters (e.g., `status`, `resourceDuration`) in the output will be used to emit metrics.
if lastChildNode != nil {
retryParentNode.Outputs = lastChildNode.Outputs.DeepCopy()
woc.wf.Status.Nodes[node.ID] = *retryParentNode
}
if processedTmpl.Metrics != nil {
// In this check, a completed node may or may not have existed prior to this execution. If it did exist, ensure that it wasn't
// completed before this execution. If it did not exist prior, then we can infer that it was completed during this execution.
Expand All @@ -1937,14 +1944,12 @@ func (woc *wfOperationCtx) executeTemplate(ctx context.Context, nodeName string,
if processedTmpl.Synchronization != nil {
woc.controller.syncManager.Release(woc.wf, node.ID, processedTmpl.Synchronization)
}
lastChildNode := getChildNodeIndex(retryParentNode, woc.wf.Status.Nodes, -1)
if lastChildNode != nil {
retryParentNode.Outputs = lastChildNode.Outputs.DeepCopy()
woc.wf.Status.Nodes[node.ID] = *retryParentNode
}
return retryParentNode, nil
} else if lastChildNode != nil && lastChildNode.Fulfilled() && processedTmpl.Metrics != nil {
// If retry node has not completed and last child node has completed, emit metrics for the last child node.
localScope, realTimeScope := woc.prepareMetricScope(lastChildNode)
woc.computeMetrics(processedTmpl.Metrics.Prometheus, localScope, realTimeScope, false)
}
lastChildNode := getChildNodeIndex(retryParentNode, woc.wf.Status.Nodes, -1)
if lastChildNode != nil && !lastChildNode.Fulfilled() {
// Last child node is still running.
nodeName = lastChildNode.Name
Expand Down

0 comments on commit cbd40e7

Please sign in to comment.