diff --git a/go.mod b/go.mod index 7d8ea2bdaefb..4c218862c55a 100644 --- a/go.mod +++ b/go.mod @@ -31,7 +31,7 @@ require ( k8s.io/utils v0.0.0-20240102154912-e7106e64919e knative.dev/pkg v0.0.0-20231010144348-ca8c009405dd sigs.k8s.io/controller-runtime v0.18.4 - sigs.k8s.io/karpenter v0.37.1-0.20240718003825-68561e8a62ad + sigs.k8s.io/karpenter v0.37.1-0.20240719210328-f6ae6b60eba0 sigs.k8s.io/yaml v1.4.0 ) diff --git a/go.sum b/go.sum index 1e43b8c564e4..34fe15db0d9a 100644 --- a/go.sum +++ b/go.sum @@ -761,8 +761,8 @@ sigs.k8s.io/controller-runtime v0.18.4 h1:87+guW1zhvuPLh1PHybKdYFLU0YJp4FhJRmiHv sigs.k8s.io/controller-runtime v0.18.4/go.mod h1:TVoGrfdpbA9VRFaRnKgk9P5/atA0pMwq+f+msb9M8Sg= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/karpenter v0.37.1-0.20240718003825-68561e8a62ad h1:hRDvKSOrF/wkr9cobka/X/YI3b36Oh2j3MgQvQsP7Ws= -sigs.k8s.io/karpenter v0.37.1-0.20240718003825-68561e8a62ad/go.mod h1:qJQHhUDQhzKa9ui7WApu0Fd2GXu70qexuBC24npDXJY= +sigs.k8s.io/karpenter v0.37.1-0.20240719210328-f6ae6b60eba0 h1:ADGqfxEow14+VCVF3tFJI7kyRNGyBhTIBn5oer7dmUE= +sigs.k8s.io/karpenter v0.37.1-0.20240719210328-f6ae6b60eba0/go.mod h1:qJQHhUDQhzKa9ui7WApu0Fd2GXu70qexuBC24npDXJY= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= diff --git a/pkg/controllers/interruption/controller.go b/pkg/controllers/interruption/controller.go index 4e46d371012d..610cbfc9a1d8 100644 --- a/pkg/controllers/interruption/controller.go +++ b/pkg/controllers/interruption/controller.go @@ -191,12 +191,6 @@ func (c *Controller) handleNodeClaim(ctx context.Context, msg messages.Message, // Record metric and event for this action c.notifyForMessage(msg, nodeClaim, node) - actionsPerformed.With( - prometheus.Labels{ - actionTypeLabel: string(action), - metrics.NodePoolLabel: nodeClaim.Labels[karpv1.NodePoolLabelKey], - }, - ).Inc() // Mark the offering as unavailable in the ICE cache since we got a spot interruption warning if msg.Kind() == messages.SpotInterruptionKind { diff --git a/pkg/controllers/interruption/metrics.go b/pkg/controllers/interruption/metrics.go index 9f8122fc16bc..f35668ab9484 100644 --- a/pkg/controllers/interruption/metrics.go +++ b/pkg/controllers/interruption/metrics.go @@ -24,7 +24,6 @@ import ( const ( interruptionSubsystem = "interruption" messageTypeLabel = "message_type" - actionTypeLabel = "action_type" terminationReasonLabel = "interruption" ) @@ -33,7 +32,7 @@ var ( prometheus.CounterOpts{ Namespace: metrics.Namespace, Subsystem: interruptionSubsystem, - Name: "received_messages", + Name: "received_messages_total", Help: "Count of messages received from the SQS queue. Broken down by message type and whether the message was actionable.", }, []string{messageTypeLabel}, @@ -42,7 +41,7 @@ var ( prometheus.CounterOpts{ Namespace: metrics.Namespace, Subsystem: interruptionSubsystem, - Name: "deleted_messages", + Name: "deleted_messages_total", Help: "Count of messages deleted from the SQS queue.", }, ) @@ -50,25 +49,13 @@ var ( prometheus.HistogramOpts{ Namespace: metrics.Namespace, Subsystem: interruptionSubsystem, - Name: "message_latency_time_seconds", + Name: "message_queue_duration_seconds", Help: "Length of time between message creation in queue and an action taken on the message by the controller.", Buckets: metrics.DurationBuckets(), }, ) - actionsPerformed = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: metrics.Namespace, - Subsystem: interruptionSubsystem, - Name: "actions_performed", - Help: "Number of notification actions performed. Labeled by action", - }, - []string{ - actionTypeLabel, - metrics.NodePoolLabel, - }, - ) ) func init() { - crmetrics.Registry.MustRegister(receivedMessages, deletedMessages, messageLatency, actionsPerformed) + crmetrics.Registry.MustRegister(receivedMessages, deletedMessages, messageLatency) } diff --git a/website/content/en/preview/getting-started/getting-started-with-karpenter/karpenter-capacity-dashboard.json b/website/content/en/preview/getting-started/getting-started-with-karpenter/karpenter-capacity-dashboard.json index 7f93053b3206..d1eaa6f93577 100644 --- a/website/content/en/preview/getting-started/getting-started-with-karpenter/karpenter-capacity-dashboard.json +++ b/website/content/en/preview/getting-started/getting-started-with-karpenter/karpenter-capacity-dashboard.json @@ -215,7 +215,7 @@ "uid": "${datasource}" }, "editorMode": "builder", - "expr": "sum by(cluster,nodepool) (karpenter_nodes_terminated{nodepool=~\"$nodepool\"})", + "expr": "sum by(cluster,nodepool) (karpenter_nodes_terminated_total{nodepool=~\"$nodepool\"})", "format": "time_series", "legendFormat": "{{cluster}}", "range": true, @@ -408,7 +408,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by(action,consolidation_type,method)(karpenter_disruption_actions_performed_total)", + "expr": "sum by(action,consolidation_type,method)(karpenter_disruption_decisions_total)", "legendFormat": "{{label_name}}", "range": true, "refId": "A" @@ -417,102 +417,6 @@ "title": "Disruption Actions Performed", "type": "timeseries" }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "description": "See: https://karpenter.sh/v0.35/concepts/disruption/#automated-methods", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 24, - "x": 0, - "y": 22 - }, - "id": 17, - "options": { - "legend": { - "calcs": [ - "last" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "editorMode": "code", - "expr": "sum by(action,consolidation_type,method)(karpenter_disruption_nodes_disrupted_total{nodepool=~\"$nodepool\"})", - "legendFormat": "{{label_name}}", - "range": true, - "refId": "A" - } - ], - "title": "Voluntary Node Disruptions: nodepool \"$nodepool\"", - "type": "timeseries" - }, { "datasource": { "type": "prometheus", @@ -1609,7 +1513,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(karpenter_disruption_actions_performed_total,method)", + "definition": "label_values(karpenter_disruption_decisions_total,method)", "hide": 0, "includeAll": true, "multi": true, @@ -1617,7 +1521,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(karpenter_disruption_actions_performed_total,method)", + "query": "label_values(karpenter_disruption_decisions_total,method)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 2, diff --git a/website/content/en/preview/reference/metrics.md b/website/content/en/preview/reference/metrics.md index 9cc551405eba..76622886c325 100644 --- a/website/content/en/preview/reference/metrics.md +++ b/website/content/en/preview/reference/metrics.md @@ -24,7 +24,7 @@ The nodepool limits are the limits specified on the nodepool that restrict the q ### `karpenter_nodeclaims_termination_duration_seconds` Duration of NodeClaim termination in seconds. -### `karpenter_nodeclaims_terminated` +### `karpenter_nodeclaims_terminated_total` Number of nodeclaims terminated in total by Karpenter. Labeled by reason the nodeclaim was terminated and the owning nodepool. ### `karpenter_nodeclaims_registered` @@ -65,7 +65,7 @@ Node total daemon limits are the resources specified by DaemonSet pod limits. ### `karpenter_nodes_termination_time_seconds` The time taken between a node's deletion request and the removal of its finalizer -### `karpenter_nodes_terminated` +### `karpenter_nodes_terminated_total` Number of nodes terminated in total by Karpenter. Labeled by owning nodepool. ### `karpenter_nodes_system_overhead` @@ -104,24 +104,21 @@ Duration of scheduling process in seconds. ## Interruption Metrics -### `karpenter_interruption_received_messages` +### `karpenter_interruption_received_messages_total` Count of messages received from the SQS queue. Broken down by message type and whether the message was actionable. -### `karpenter_interruption_message_latency_time_seconds` +### `karpenter_interruption_message_queue_duration_seconds` Length of time between message creation in queue and an action taken on the message by the controller. -### `karpenter_interruption_deleted_messages` +### `karpenter_interruption_deleted_messages_total` Count of messages deleted from the SQS queue. -### `karpenter_interruption_actions_performed` -Number of notification actions performed. Labeled by action - ## Disruption Metrics ### `karpenter_disruption_replacement_nodeclaim_initialized_seconds` Amount of time required for a replacement nodeclaim to become initialized. -### `karpenter_disruption_replacement_nodeclaim_failures_total` +### `karpenter_disruption_queue_failures_total` The number of times that Karpenter failed to launch a replacement node for disruption. Labeled by disruption method. ### `karpenter_disruption_queue_depth` @@ -130,9 +127,6 @@ The number of commands currently being waited on in the disruption orchestration ### `karpenter_disruption_pods_disrupted_total` Total number of reschedulable pods disrupted on nodes. Labeled by NodePool, disruption action, method, and consolidation type. -### `karpenter_disruption_nodes_disrupted_total` -Total number of nodes disrupted. Labeled by NodePool, disruption action, method, and consolidation type. - ### `karpenter_disruption_evaluation_duration_seconds` Duration of the disruption evaluation process in seconds. Labeled by method and consolidation type. @@ -142,10 +136,10 @@ Number of nodes eligible for disruption by Karpenter. Labeled by disruption meth ### `karpenter_disruption_consolidation_timeouts_total` Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type. -### `karpenter_disruption_budgets_allowed_disruptions` +### `karpenter_nodepools_allowed_disruptions` The number of nodes for a given NodePool that can be disrupted at a point in time. Labeled by NodePool. Note that allowed disruptions can change very rapidly, as new nodes may be created and others may be deleted at any point. -### `karpenter_disruption_actions_performed_total` +### `karpenter_disruption_decisions_total` Number of disruption actions performed. Labeled by disruption action, method, and consolidation type. ## Consistency Metrics