From 272139690e028d3bdebdb6bcb1824fec23cefd0f Mon Sep 17 00:00:00 2001 From: Yury Akudovich Date: Thu, 31 Oct 2024 14:47:04 +0100 Subject: [PATCH] feat(prover): Add queue metric to report autoscaler view of the queue. (#3206) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What ❔ Add `queue` metric to report autoscaler view of the queue. Add Copy trait for QueueReportFields and remove unneeded clone or references. ## Why ❔ The `queue` metric will be used in dashboards. ## Checklist - [x] PR title corresponds to the body of PR (we generate changelog entries from PRs). - [ ] Tests for the changes have been added / updated. - [ ] Documentation comments have been added / updated. - [x] Code has been formatted via `zkstack dev fmt` and `zkstack dev lint`. ref ZKD-1855 --- core/lib/config/src/configs/prover_autoscaler.rs | 2 +- prover/crates/bin/prover_autoscaler/src/global/queuer.rs | 6 +++--- prover/crates/bin/prover_autoscaler/src/global/scaler.rs | 8 +++++--- prover/crates/bin/prover_autoscaler/src/metrics.rs | 4 +++- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/core/lib/config/src/configs/prover_autoscaler.rs b/core/lib/config/src/configs/prover_autoscaler.rs index ab6b8fdf202f..4191208b96e3 100644 --- a/core/lib/config/src/configs/prover_autoscaler.rs +++ b/core/lib/config/src/configs/prover_autoscaler.rs @@ -100,7 +100,7 @@ pub enum Gpu { // TODO: generate this enum by QueueReport from https://github.com/matter-labs/zksync-era/blob/main/prover/crates/bin/prover_job_monitor/src/autoscaler_queue_reporter.rs#L23 // and remove allowing of non_camel_case_types by generating field name parser. 
-#[derive(Debug, Display, PartialEq, Eq, Hash, Clone, Deserialize, EnumString, Default)] +#[derive(Debug, Display, PartialEq, Eq, Hash, Clone, Copy, Deserialize, EnumString, Default)] #[allow(non_camel_case_types)] pub enum QueueReportFields { #[strum(ascii_case_insensitive)] diff --git a/prover/crates/bin/prover_autoscaler/src/global/queuer.rs b/prover/crates/bin/prover_autoscaler/src/global/queuer.rs index e2cd1c6a4fb2..7255f479647e 100644 --- a/prover/crates/bin/prover_autoscaler/src/global/queuer.rs +++ b/prover/crates/bin/prover_autoscaler/src/global/queuer.rs @@ -24,7 +24,7 @@ pub struct Queuer { pub prover_job_monitor_url: String, } -fn target_to_queue(target: &QueueReportFields, report: &QueueReport) -> u64 { +fn target_to_queue(target: QueueReportFields, report: &QueueReport) -> u64 { let res = match target { QueueReportFields::basic_witness_jobs => report.basic_witness_jobs.all(), QueueReportFields::leaf_witness_jobs => report.leaf_witness_jobs.all(), @@ -65,8 +65,8 @@ impl Queuer { .flat_map(|versioned_report| { jobs.iter().map(move |j| { ( - (versioned_report.version.to_string(), j.clone()), - target_to_queue(j, &versioned_report.report), + (versioned_report.version.to_string(), *j), + target_to_queue(*j, &versioned_report.report), ) }) }) diff --git a/prover/crates/bin/prover_autoscaler/src/global/scaler.rs b/prover/crates/bin/prover_autoscaler/src/global/scaler.rs index 362fbbac0744..dc652999da5f 100644 --- a/prover/crates/bin/prover_autoscaler/src/global/scaler.rs +++ b/prover/crates/bin/prover_autoscaler/src/global/scaler.rs @@ -124,7 +124,7 @@ impl Scaler { let mut simple_scalers = Vec::default(); let mut jobs = vec![QueueReportFields::prover_jobs]; for c in &config.scaler_targets { - jobs.push(c.queue_report_field.clone()); + jobs.push(c.queue_report_field); simple_scalers.push(SimpleScaler::new( c, config.cluster_priorities.clone(), @@ -429,7 +429,7 @@ impl SimpleScaler { long_pending_duration: chrono::Duration, ) -> Self { Self { - 
queue_report_field: config.queue_report_field.clone(), + queue_report_field: config.queue_report_field, deployment: config.deployment.clone(), cluster_priorities, max_replicas: config.max_replicas.clone(), @@ -671,6 +671,7 @@ impl Task for Scaler { .get(&(ppv.to_string(), QueueReportFields::prover_jobs)) .cloned() .unwrap_or(0); + AUTOSCALER_METRICS.queue[&(ns.clone(), "prover".into())].set(q); tracing::debug!("Running eval for namespace {ns} and PPV {ppv} found queue {q}"); if q > 0 || is_namespace_running(ns, &guard.clusters) { let provers = self.prover_scaler.run(ns, q, &guard.clusters); @@ -684,9 +685,10 @@ impl Task for Scaler { // Simple Scalers. for scaler in &self.simple_scalers { let q = queue - .get(&(ppv.to_string(), scaler.queue_report_field.clone())) + .get(&(ppv.to_string(), scaler.queue_report_field)) .cloned() .unwrap_or(0); + AUTOSCALER_METRICS.queue[&(ns.clone(), scaler.deployment.clone())].set(q); tracing::debug!("Running eval for namespace {ns}, PPV {ppv}, simple scaler {} found queue {q}", scaler.deployment); if q > 0 || is_namespace_running(ns, &guard.clusters) { let replicas = scaler.run(ns, q, &guard.clusters); diff --git a/prover/crates/bin/prover_autoscaler/src/metrics.rs b/prover/crates/bin/prover_autoscaler/src/metrics.rs index 853e3db000f1..39860a9e8f09 100644 --- a/prover/crates/bin/prover_autoscaler/src/metrics.rs +++ b/prover/crates/bin/prover_autoscaler/src/metrics.rs @@ -16,7 +16,9 @@ pub(crate) struct AutoscalerMetrics { #[metrics(labels = ["target", "status"])] pub calls: LabeledFamily<(String, u16), Counter, 2>, #[metrics(labels = ["target_cluster"])] - pub scale_errors: LabeledFamily<String, Gauge<u64>, 1>, + pub scale_errors: LabeledFamily<String, Gauge<u64>>, + #[metrics(labels = ["target_namespace", "job"])] + pub queue: LabeledFamily<(String, String), Gauge<u64>, 2>, } #[vise::register]