Skip to content

Commit

Permalink
Add a metric that tracks the number of preemptions issued by a Cluste…
Browse files Browse the repository at this point in the history
…rQueue

* Metric for preempted workloads
  • Loading branch information
vladikkuzn committed Jul 5, 2024
1 parent 81ad017 commit 41b04ed
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 0 deletions.
19 changes: 19 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,19 @@ The label 'reason' can have the following values:
}, []string{"cluster_queue", "reason"},
)

PreemptedWorkloadsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: constants.KueueName,
Name: "preempted_workloads_total",
Help: `The number of preempted workloads per 'preemptor_cluster_queue',
The label 'reason' can have the following values:
- "preeemption.InClusterQueueReason"
- "preemption.InCohortReclamationReason"
- "preemption.InCohortFairSharingReason"
- "preemption.InCohortReclaimWhileBorrowingReason"`,
}, []string{"preemptor_cluster_queue", "reason"},
)

// Metrics tied to the cache.

ReservingActiveWorkloads = prometheus.NewGaugeVec(
Expand Down Expand Up @@ -262,6 +275,10 @@ func ReportEvictedWorkloads(cqName, reason string) {
EvictedWorkloadsTotal.WithLabelValues(cqName, reason).Inc()
}

func ReportPreemptedWorkloads(preemptorCqName, reason string) {
PreemptedWorkloadsTotal.WithLabelValues(preemptorCqName, reason).Inc()
}

func ClearQueueSystemMetrics(cqName string) {
PendingWorkloads.DeleteLabelValues(cqName, PendingStatusActive)
PendingWorkloads.DeleteLabelValues(cqName, PendingStatusInadmissible)
Expand All @@ -271,6 +288,7 @@ func ClearQueueSystemMetrics(cqName string) {
admissionWaitTime.DeleteLabelValues(cqName)
admissionChecksWaitTime.DeleteLabelValues(cqName)
EvictedWorkloadsTotal.DeletePartialMatch(prometheus.Labels{"cluster_queue": cqName})
PreemptedWorkloadsTotal.DeletePartialMatch(prometheus.Labels{"preemptor_cluster_queue": cqName})
}

func ReportClusterQueueStatus(cqName string, cqStatus ClusterQueueStatus) {
Expand Down Expand Up @@ -378,6 +396,7 @@ func Register() {
quotaReservedWaitTime,
AdmittedWorkloadsTotal,
EvictedWorkloadsTotal,
PreemptedWorkloadsTotal,
admissionWaitTime,
admissionChecksWaitTime,
ClusterQueueResourceUsage,
Expand Down
22 changes: 22 additions & 0 deletions pkg/scheduler/preemption/preemption.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,26 @@ func canBorrowWithinCohort(cq *cache.ClusterQueue, wl *kueue.Workload) (bool, *i
return true, &threshold
}

// Preemption origins
const (
// ClusterQueueOrigin indicates that preemption originated from cluster queue
ClusterQueueOrigin = "ClusterQueue"
// CohortOrigin indicates that preemption originated from cohort
CohortOrigin = "Cohort"
)

// Reasons of ClusterQueueOrigin
const (
InClusterQueueReason = "InClusterQueue"
)

// Reasons of CohortOrigin
const (
InCohortReclamationReason = "InCohortReclamation"
InCohortFairSharingReason = "InCohortFairSharing"
InCohortReclaimWhileBorrowingReason = "InCohortReclaimWhileBorrowing"
)

// IssuePreemptions marks the target workloads as evicted.
func (p *Preemptor) IssuePreemptions(ctx context.Context, preemptor *workload.Info, targets []*workload.Info, cq *cache.ClusterQueue) (int, error) {
log := ctrl.LoggerFrom(ctx)
Expand All @@ -187,6 +207,8 @@ func (p *Preemptor) IssuePreemptions(ctx context.Context, preemptor *workload.In
}

log.V(3).Info("Preempted", "targetWorkload", klog.KObj(target.Obj), "reason", reason, "message", message)
metrics.ReportPreemptedWorkloads(preemptor.ClusterQueue, reason)

p.recorder.Eventf(target.Obj, corev1.EventTypeNormal, "Preempted", message)
metrics.ReportEvictedWorkloads(target.ClusterQueue, kueue.WorkloadEvictedByPreemption)
} else {
Expand Down

0 comments on commit 41b04ed

Please sign in to comment.