Skip to content

Commit

Permalink
chore(node): restore to check host CPU usage
Browse files Browse the repository at this point in the history
  • Loading branch information
maqi committed Jan 14, 2025
1 parent dfd692f commit 0d1c4c4
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 54 deletions.
10 changes: 5 additions & 5 deletions ant-logging/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct Metrics {
process: Option<ProcessMetrics>,
}

#[derive(Clone, Debug, Default, Serialize)]
#[derive(Debug, Serialize)]
struct ProcessMetrics {
// Percentage of CPU used by the process
cpu_usage_percent: f32,
Expand Down Expand Up @@ -73,10 +73,10 @@ pub async fn init_metrics(pid: u32) {
}
};

// let system_cpu_usage_percent = sys.global_cpu_usage();
let system_cpu_usage_percent = sys.global_cpu_usage();
let metrics = Metrics {
physical_cpu_threads: sys.cpus().len(),
system_cpu_usage_percent: process.clone().unwrap_or_default().cpu_usage_percent,
system_cpu_usage_percent,
process,
};
match serde_json::to_string(&metrics) {
Expand All @@ -91,8 +91,8 @@ pub async fn init_metrics(pid: u32) {

// Refreshes only the metrics that we are interested in.
fn refresh_metrics(sys: &mut System, networks: &mut Networks, pid: Pid) {
// sys.refresh_memory();
// sys.refresh_cpu_usage();
sys.refresh_memory();
sys.refresh_cpu_usage();
networks.refresh(true);

// To refresh only the specific process:
Expand Down
80 changes: 31 additions & 49 deletions ant-node/src/bin/antnode/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ use std::{
process::Command,
time::Duration,
};
use sysinfo::{self, Pid, ProcessRefreshKind, ProcessesToUpdate, System};
use sysinfo::{self, System};
use tokio::{
runtime::Runtime,
sync::{broadcast::error::RecvError, mpsc},
Expand Down Expand Up @@ -413,14 +413,12 @@ You can check your reward balance by running:
}
});
let ctrl_tx_clone_cpu = ctrl_tx.clone();
let pid = Pid::from_u32(std::process::id());
// Monitor process CPU usage
// Monitor Host CPU usage
tokio::spawn(async move {
use rand::{thread_rng, Rng};

const CPU_CHECK_INTERVAL: Duration = Duration::from_secs(60);
// As this is now per-process CPU usage, it should never exceed 10% for an extended period.
const CPU_USAGE_THRESHOLD: f64 = 10.0;
const CPU_USAGE_THRESHOLD: f32 = 50.0;
const HIGH_CPU_CONSECUTIVE_LIMIT: u8 = 5;
const NODE_STOP_DELAY: Duration = Duration::from_secs(1);
const INITIAL_DELAY_MIN_S: u64 = 10;
Expand All @@ -431,10 +429,6 @@ You can check your reward balance by running:

let mut high_cpu_count: u8 = 0;

let process_refresh_kind = ProcessRefreshKind::everything()
.without_disk_usage()
.without_memory();

// Random initial delay between INITIAL_DELAY_MIN_S and INITIAL_DELAY_MAX_S seconds (inclusive)
let initial_delay =
Duration::from_secs(thread_rng().gen_range(INITIAL_DELAY_MIN_S..=INITIAL_DELAY_MAX_S));
Expand All @@ -443,49 +437,37 @@ You can check your reward balance by running:
loop {
{
let mut system = System::new_all();
let physical_core_count = system.physical_core_count();
system.refresh_processes_specifics(
ProcessesToUpdate::Some(&[pid]),
true,
process_refresh_kind,
);
system.refresh_cpu_usage();
tokio::time::sleep(Duration::from_millis(300)).await;
system.refresh_processes_specifics(
ProcessesToUpdate::Some(&[pid]),
true,
process_refresh_kind,
);
if let (Some(process), Some(core_count)) =
(system.process(pid), physical_core_count)
{
// divide by core_count to get value between 0-100
let cpu_usage = ((process.cpu_usage() as f64 / core_count as f64) * 10000.0)
.round()
/ 10000.0;
info!(
"Detected process {pid} CPU usage is {cpu_usage:?} (with {core_count} cores)"
);

if cpu_usage > CPU_USAGE_THRESHOLD {
high_cpu_count += 1;
} else {
high_cpu_count = 0;
}
system.refresh_cpu_usage();

if high_cpu_count >= HIGH_CPU_CONSECUTIVE_LIMIT {
if let Err(err) = ctrl_tx_clone_cpu
.send(NodeCtrl::Stop {
delay: NODE_STOP_DELAY,
result: StopResult::Success(format!("Excess host CPU %{CPU_USAGE_THRESHOLD} detected for {HIGH_CPU_CONSECUTIVE_LIMIT} consecutive minutes!")),
})
.await
{
error!("Failed to send node control msg to antnode bin main thread: {err}");
}
break;
}
let cpu_usage = system.global_cpu_usage();

if cpu_usage > CPU_USAGE_THRESHOLD {
high_cpu_count += 1;
} else {
high_cpu_count = 0;
}

info!("Detected Host CPU usage is {cpu_usage:?}.");

if cpu_usage > CPU_USAGE_THRESHOLD {
high_cpu_count += 1;
} else {
error!("Cann't refresh systeminfo of process {pid} with OS core_count of {physical_core_count:?}");
high_cpu_count = 0;
}

if high_cpu_count >= HIGH_CPU_CONSECUTIVE_LIMIT {
if let Err(err) = ctrl_tx_clone_cpu
.send(NodeCtrl::Stop {
delay: NODE_STOP_DELAY,
result: StopResult::Success(format!("Excess host CPU %{CPU_USAGE_THRESHOLD} detected for {HIGH_CPU_CONSECUTIVE_LIMIT} consecutive minutes!")),
})
.await
{
error!("Failed to send node control msg to antnode bin main thread: {err}");
}
break;
}
}
// Add jitter to the interval
Expand Down

0 comments on commit 0d1c4c4

Please sign in to comment.