From 25ca5febefaec5cd93da03486d643c14e2474ab3 Mon Sep 17 00:00:00 2001 From: vladikkuzn Date: Thu, 11 Jul 2024 12:20:26 +0300 Subject: [PATCH] Add a metric that tracks the number of preemptions issued by a ClusterQueue * Merge metric for preempted workloads into ReportPreemption * test helper --- pkg/metrics/metrics.go | 5 +++-- pkg/metrics/metrics_test.go | 20 ++++++++++++------- pkg/scheduler/preemption/preemption.go | 6 ++---- test/util/util.go | 27 ++++++++++---------------- 4 files changed, 28 insertions(+), 30 deletions(-) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 44472ceb57..487b935b22 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -275,8 +275,9 @@ func ReportEvictedWorkloads(cqName, reason string) { EvictedWorkloadsTotal.WithLabelValues(cqName, reason).Inc() } -func ReportPreemptedWorkloads(preemptorCqName, reason string) { - PreemptedWorkloadsTotal.WithLabelValues(preemptorCqName, reason).Inc() +func ReportPreemption(preemptingCqName, preemptingReason, targetCqName string) { + PreemptedWorkloadsTotal.WithLabelValues(preemptingCqName, preemptingReason).Inc() + ReportEvictedWorkloads(targetCqName, kueue.WorkloadEvictedByPreemption) } func ClearQueueSystemMetrics(cqName string) { diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 2f41760ad3..d3bbd780b9 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -154,19 +154,25 @@ func TestReportAndCleanupClusterQueueEvictedNumber(t *testing.T) { expectFilteredMetricsCount(t, EvictedWorkloadsTotal, 2, "cluster_queue", "cluster_queue1") expectFilteredMetricsCount(t, EvictedWorkloadsTotal, 1, "cluster_queue", "cluster_queue1", "reason", "Preempted") expectFilteredMetricsCount(t, EvictedWorkloadsTotal, 1, "cluster_queue", "cluster_queue1", "reason", "Evicted") - // clear + ClearQueueSystemMetrics("cluster_queue1") expectFilteredMetricsCount(t, EvictedWorkloadsTotal, 0, "cluster_queue", "cluster_queue1") } func TestReportAndCleanupClusterQueuePreemptedNumber(t *testing.T) { - ReportPreemptedWorkloads("cluster_queue1", "InCohortReclamation") - ReportPreemptedWorkloads("cluster_queue1", "InCohortFairSharing") - - expectFilteredMetricsCount(t, PreemptedWorkloadsTotal, 2, "preemptor_cluster_queue", "cluster_queue1") - expectFilteredMetricsCount(t, PreemptedWorkloadsTotal, 1, "preemptor_cluster_queue", "cluster_queue1", "reason", "InCohortReclamation") + ReportPreemption("cluster_queue1", "InClusterQueue", "cluster_queue1") + ReportPreemption("cluster_queue1", "InCohortReclamation", "cluster_queue1") + ReportPreemption("cluster_queue1", "InCohortFairSharing", "cluster_queue1") + ReportPreemption("cluster_queue1", "InCohortReclaimWhileBorrowing", "cluster_queue1") + + expectFilteredMetricsCount(t, PreemptedWorkloadsTotal, 4, "preemptor_cluster_queue", "cluster_queue1") + expectFilteredMetricsCount(t, EvictedWorkloadsTotal, 1, "cluster_queue", "cluster_queue1") + expectFilteredMetricsCount(t, PreemptedWorkloadsTotal, 1, "preemptor_cluster_queue", "cluster_queue1", "reason", "InClusterQueue") expectFilteredMetricsCount(t, PreemptedWorkloadsTotal, 1, "preemptor_cluster_queue", "cluster_queue1", "reason", "InCohortFairSharing") - // clear + expectFilteredMetricsCount(t, PreemptedWorkloadsTotal, 1, "preemptor_cluster_queue", "cluster_queue1", "reason", "InCohortReclamation") + expectFilteredMetricsCount(t, PreemptedWorkloadsTotal, 1, "preemptor_cluster_queue", "cluster_queue1", "reason", "InCohortReclaimWhileBorrowing") + ClearQueueSystemMetrics("cluster_queue1") expectFilteredMetricsCount(t, PreemptedWorkloadsTotal, 0, "preemptor_cluster_queue", "cluster_queue1") + expectFilteredMetricsCount(t, EvictedWorkloadsTotal, 0, "cluster_queue", "cluster_queue1") } diff --git a/pkg/scheduler/preemption/preemption.go b/pkg/scheduler/preemption/preemption.go index 3003539b52..be4a62aa29 100644 --- a/pkg/scheduler/preemption/preemption.go +++ b/pkg/scheduler/preemption/preemption.go @@ -186,11 +186,9 @@ func (p *Preemptor) IssuePreemptions(ctx context.Context, preemptor *workload.In return } - log.V(3).Info("Preempted", "targetWorkload", klog.KObj(target.Obj), "reason", reason, "message", message) - metrics.ReportPreemptedWorkloads(preemptor.ClusterQueue, reason) - + log.V(3).Info("Preempted", "targetWorkload", klog.KObj(target.Obj), "reason", reason, "message", message, "targetClusterQueue", target.ClusterQueue) p.recorder.Eventf(target.Obj, corev1.EventTypeNormal, "Preempted", message) - metrics.ReportEvictedWorkloads(target.ClusterQueue, kueue.WorkloadEvictedByPreemption) + metrics.ReportPreemption(preemptor.ClusterQueue, reason, target.ClusterQueue) } else { log.V(3).Info("Preemption ongoing", "targetWorkload", klog.KObj(target.Obj)) } diff --git a/test/util/util.go b/test/util/util.go index f45a449e5c..ddb0f47e9b 100644 --- a/test/util/util.go +++ b/test/util/util.go @@ -26,6 +26,7 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus" zaplog "go.uber.org/zap" "go.uber.org/zap/zapcore" batchv1 "k8s.io/api/batch/v1" @@ -488,37 +489,29 @@ func ExpectReservingActiveWorkloadsMetric(cq *kueue.ClusterQueue, v int) { func ExpectAdmittedWorkloadsTotalMetric(cq *kueue.ClusterQueue, v int) { metric := metrics.AdmittedWorkloadsTotal.WithLabelValues(cq.Name) - gomega.EventuallyWithOffset(1, func(g gomega.Gomega) { - count, err := testutil.GetCounterMetricValue(metric) - g.Expect(err).ToNot(gomega.HaveOccurred()) - g.Expect(int(count)).Should(gomega.Equal(v)) - }, Timeout, Interval).Should(gomega.Succeed()) + expectCounterMetric(metric, v) } func ExpectEvictedWorkloadsTotalMetric(cqName, reason string, v int) { metric := metrics.EvictedWorkloadsTotal.WithLabelValues(cqName, reason) - gomega.EventuallyWithOffset(1, func(g gomega.Gomega) { - count, err := testutil.GetCounterMetricValue(metric) - g.Expect(err).ToNot(gomega.HaveOccurred()) - g.Expect(int(count)).Should(gomega.Equal(v)) - }, Timeout, Interval).Should(gomega.Succeed()) + expectCounterMetric(metric, v) } func ExpectPreemptedWorkloadsTotalMetric(preemptorCqName, reason string, v int) { metric := metrics.PreemptedWorkloadsTotal.WithLabelValues(preemptorCqName, reason) - gomega.EventuallyWithOffset(1, func(g gomega.Gomega) { - count, err := testutil.GetCounterMetricValue(metric) - g.Expect(err).ToNot(gomega.HaveOccurred()) - g.Expect(int(count)).Should(gomega.Equal(v)) - }, Timeout, Interval).Should(gomega.Succeed()) + expectCounterMetric(metric, v) } func ExpectQuotaReservedWorkloadsTotalMetric(cq *kueue.ClusterQueue, v int) { metric := metrics.QuotaReservedWorkloadsTotal.WithLabelValues(cq.Name) + expectCounterMetric(metric, v) +} + +func expectCounterMetric(metric prometheus.Counter, count int) { gomega.EventuallyWithOffset(1, func(g gomega.Gomega) { - count, err := testutil.GetCounterMetricValue(metric) + v, err := testutil.GetCounterMetricValue(metric) g.Expect(err).ToNot(gomega.HaveOccurred()) - g.Expect(int(count)).Should(gomega.Equal(v)) + g.Expect(int(v)).Should(gomega.Equal(count)) }, Timeout, Interval).Should(gomega.Succeed()) }