Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(metrics): avoid intialize un-necessary system resources #2624

Open
wants to merge 2 commits into
base: stable
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .github/workflows/memcheck.yml
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,22 @@ jobs:
run: pgrep antnode | wc -l
if: always()

- name: confirm opened FDs
shell: bash
timeout-minutes: 1
run: |
fd_cap="30"
pids=$(pgrep antnode)
for pid in $pids; do
fd_count=$(ls /proc/$pid/fd | wc -l)
echo "Process $pid - File Descriptors: $fd_count"
if (( $(echo "$fd_count > $fd_cap" | bc -l) )); then
echo "Process $pid holding FD exceeded threshold: $fd_cap"
exit 1
fi
done
if: always()

- name: Stop the local network and upload logs
if: always()
uses: maidsafe/ant-local-testnet-action@main
Expand Down
55 changes: 44 additions & 11 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion ant-logging/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ opentelemetry-semantic-conventions = { version = "0.12.0", optional = true }
rand = { version = "~0.8.5", features = ["small_rng"], optional = true }
serde = { version = "1.0.133", features = ["derive", "rc"] }
serde_json = { version = "1.0" }
sysinfo = { version = "0.30.8", default-features = false, optional = true }
sysinfo = { version = "0.33.1", optional = true }
thiserror = "1.0.23"
tokio = { version = "1.32.0", optional = true }
tracing = { version = "~0.1.26" }
Expand Down
23 changes: 16 additions & 7 deletions ant-logging/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

use serde::Serialize;
use std::time::Duration;
use sysinfo::{self, Networks, Pid, System};
use sysinfo::{self, Networks, Pid, ProcessRefreshKind, ProcessesToUpdate, System};
use tracing::{debug, error};

const UPDATE_INTERVAL: Duration = Duration::from_secs(15);
Expand Down Expand Up @@ -44,9 +44,9 @@ struct ProcessMetrics {
// Obtains the system metrics every UPDATE_INTERVAL and logs it.
// The function should be spawned as a task and should be re-run if our main process is restarted.
pub async fn init_metrics(pid: u32) {
let mut sys = System::new_all();
let mut networks = Networks::new_with_refreshed_list();
let pid = Pid::from_u32(pid);
let mut sys = System::new();

loop {
refresh_metrics(&mut sys, &mut networks, pid);
Expand All @@ -70,10 +70,10 @@ pub async fn init_metrics(pid: u32) {
}
};

let cpu_stat = sys.global_cpu_info();
let system_cpu_usage_percent = sys.global_cpu_usage();
let metrics = Metrics {
physical_cpu_threads: sys.cpus().len(),
system_cpu_usage_percent: cpu_stat.cpu_usage(),
system_cpu_usage_percent,
process,
};
match serde_json::to_string(&metrics) {
Expand All @@ -87,8 +87,17 @@ pub async fn init_metrics(pid: u32) {

// Refreshes only the metrics that we interested in.
fn refresh_metrics(sys: &mut System, networks: &mut Networks, pid: Pid) {
sys.refresh_process(pid);
sys.refresh_memory();
sys.refresh_cpu();
networks.refresh();
sys.refresh_cpu_usage();
networks.refresh(true);

// To refresh only the specific process:
sys.refresh_processes_specifics(
ProcessesToUpdate::Some(&[pid]),
true,
ProcessRefreshKind::nothing()
.with_cpu()
.with_disk_usage()
.with_memory(),
);
}
2 changes: 1 addition & 1 deletion ant-networking/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ self_encryption = "~0.30.0"
serde = { version = "1.0.133", features = ["derive", "rc"] }
sha2 = "0.10"
strum = { version = "0.26.2", features = ["derive"] }
sysinfo = { version = "0.30.8", default-features = false, optional = true }
sysinfo = { version = "0.33.1", optional = true }
thiserror = "1.0.23"
tiny-keccak = { version = "~2.0.2", features = ["sha3"] }
tokio = { version = "1.32.0", features = [
Expand Down
14 changes: 11 additions & 3 deletions ant-networking/src/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use prometheus_client::{
metrics::family::Family,
metrics::{counter::Counter, gauge::Gauge},
};
use sysinfo::{Pid, ProcessRefreshKind, System};
use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
use tokio::time::Duration;

const UPDATE_INTERVAL: Duration = Duration::from_secs(15);
Expand Down Expand Up @@ -246,12 +246,19 @@ impl NetworkMetricsRecorder {

let pid = Pid::from_u32(std::process::id());
let process_refresh_kind = ProcessRefreshKind::everything().without_disk_usage();
let mut system = System::new_all();

let mut system = System::new();
let physical_core_count = system.physical_core_count();
info!("Detected physical_core_count {physical_core_count:?}");

tokio::spawn(async move {
loop {
system.refresh_process_specifics(pid, process_refresh_kind);
system.refresh_processes_specifics(
ProcessesToUpdate::Some(&[pid]),
true,
process_refresh_kind,
);

if let (Some(process), Some(core_count)) =
(system.process(pid), physical_core_count)
{
Expand All @@ -264,6 +271,7 @@ impl NetworkMetricsRecorder {
/ 10000.0;
let _ = process_cpu_usage_percentage.set(cpu_usage);
}

sleep(UPDATE_INTERVAL).await;
}
});
Expand Down
2 changes: 1 addition & 1 deletion ant-node-manager/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ semver = "1.0.20"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
service-manager = "0.7.0"
sysinfo = "0.30.12"
sysinfo = "0.33.1"
thiserror = "1.0.23"
tokio = { version = "1.26", features = ["full"] }
tracing = { version = "~0.1.26" }
Expand Down
2 changes: 1 addition & 1 deletion ant-node/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ rayon = "1.8.0"
self_encryption = "~0.30.0"
serde = { version = "1.0.133", features = ["derive", "rc"] }
strum = { version = "0.26.2", features = ["derive"] }
sysinfo = { version = "0.30.8", default-features = false }
sysinfo = "0.33.1"
thiserror = "1.0.23"
tokio = { version = "1.32.0", features = [
"io-util",
Expand Down
14 changes: 8 additions & 6 deletions ant-node/src/bin/antnode/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ You can check your reward balance by running:
}
});
let ctrl_tx_clone_cpu = ctrl_tx.clone();
// Monitor host CPU usage
// Monitor Host CPU usage
tokio::spawn(async move {
use rand::{thread_rng, Rng};

Expand All @@ -427,18 +427,21 @@ You can check your reward balance by running:
const JITTER_MIN_S: u64 = 1;
const JITTER_MAX_S: u64 = 15;

let mut sys = System::new_all();

let mut high_cpu_count: u8 = 0;

let mut system = System::new();
system.refresh_cpu_usage();

// Random initial delay between 1 and 5 minutes
let initial_delay =
Duration::from_secs(thread_rng().gen_range(INITIAL_DELAY_MIN_S..=INITIAL_DELAY_MAX_S));
tokio::time::sleep(initial_delay).await;

loop {
sys.refresh_cpu();
let cpu_usage = sys.global_cpu_info().cpu_usage();
system.refresh_cpu_usage();

let cpu_usage = system.global_cpu_usage();
info!("Detected Host CPU usage is {cpu_usage:?}.");

if cpu_usage > CPU_USAGE_THRESHOLD {
high_cpu_count += 1;
Expand All @@ -458,7 +461,6 @@ You can check your reward balance by running:
}
break;
}

// Add jitter to the interval
let jitter = Duration::from_secs(thread_rng().gen_range(JITTER_MIN_S..=JITTER_MAX_S));
tokio::time::sleep(CPU_CHECK_INTERVAL + jitter).await;
Expand Down
2 changes: 1 addition & 1 deletion ant-service-management/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
semver = "1.0.20"
service-manager = "0.7.0"
sysinfo = "0.30.12"
sysinfo = "0.33.1"
thiserror = "1.0.23"
tokio = { version = "1.32.0", features = ["time"] }
tonic = { version = "0.6.2" }
Expand Down
2 changes: 1 addition & 1 deletion node-launchpad/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ serde_json = "1.0.107"
signal-hook = "0.3.17"
strip-ansi-escapes = "0.2.0"
strum = { version = "0.26.1", features = ["derive"] }
sysinfo = "0.30.12"
sysinfo = "0.33.1"
throbber-widgets-tui = "0.8.0"
tokio = { version = "1.32.0", features = ["full"] }
tokio-util = "0.7.9"
Expand Down
Loading