From bf05dbfce4cbe04994e9f561ee2ea4ce8fd7d4a0 Mon Sep 17 00:00:00 2001 From: qima Date: Tue, 14 Jan 2025 22:49:38 +0800 Subject: [PATCH 1/2] fix(metrics): avoid initializing unnecessary system resources It turned out that the new_all function opens FDs for unnecessary system resources, which results in an accumulating memory-usage issue. Hence, initialize with nothing and then refresh only what is needed. --- Cargo.lock | 55 ++++++++++++++++++++++++------- ant-logging/Cargo.toml | 2 +- ant-logging/src/metrics.rs | 23 +++++++++---- ant-networking/Cargo.toml | 2 +- ant-networking/src/metrics/mod.rs | 14 ++++++-- ant-node-manager/Cargo.toml | 2 +- ant-node/Cargo.toml | 2 +- ant-node/src/bin/antnode/main.rs | 14 ++++---- ant-service-management/Cargo.toml | 2 +- node-launchpad/Cargo.toml | 2 +- 10 files changed, 85 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d9327f6bdf..a967d71edc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9201,17 +9201,16 @@ dependencies = [ [[package]] name = "sysinfo" -version = "0.30.13" +version = "0.33.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3" +checksum = "4fc858248ea01b66f19d8e8a6d55f41deaf91e9d495246fd01368d99935c6c01" dependencies = [ - "cfg-if", "core-foundation-sys", "libc", + "memchr", "ntapi", - "once_cell", "rayon", - "windows 0.52.0", + "windows 0.57.0", ] [[package]] @@ -10541,21 +10540,21 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows" -version = "0.52.0" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +checksum = "efc5cf48f83140dcaab716eeaea345f9e93d0018fb81162753a3f76c3397b538" dependencies = [ - "windows-core 0.52.0", + "windows-core 0.53.0", "windows-targets 0.52.6", ] [[package]] name = "windows" -version = "0.53.0" +version = "0.57.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efc5cf48f83140dcaab716eeaea345f9e93d0018fb81162753a3f76c3397b538" +checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" dependencies = [ - "windows-core 0.53.0", + "windows-core 0.57.0", "windows-targets 0.52.6", ] @@ -10578,6 +10577,40 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-core" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-result 0.1.2", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-implement" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + +[[package]] +name = "windows-interface" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "windows-registry" version = "0.2.0" diff --git a/ant-logging/Cargo.toml b/ant-logging/Cargo.toml index 92394789c6..370341ca76 100644 --- a/ant-logging/Cargo.toml +++ b/ant-logging/Cargo.toml @@ -19,7 +19,7 @@ opentelemetry-semantic-conventions = { version = "0.12.0", optional = true } rand = { version = "~0.8.5", features = ["small_rng"], optional = true } serde = { version = "1.0.133", features = ["derive", "rc"] } serde_json = { version = "1.0" } -sysinfo = { version = "0.30.8", default-features = false, optional = true } +sysinfo = { version = "0.33.1", optional = true } thiserror = "1.0.23" tokio = { version = "1.32.0", optional = true } tracing = { version = "~0.1.26" } diff --git 
a/ant-logging/src/metrics.rs b/ant-logging/src/metrics.rs index b6d99a6137..9ee23a7ec5 100644 --- a/ant-logging/src/metrics.rs +++ b/ant-logging/src/metrics.rs @@ -8,7 +8,7 @@ use serde::Serialize; use std::time::Duration; -use sysinfo::{self, Networks, Pid, System}; +use sysinfo::{self, Networks, Pid, ProcessRefreshKind, ProcessesToUpdate, System}; use tracing::{debug, error}; const UPDATE_INTERVAL: Duration = Duration::from_secs(15); @@ -44,9 +44,9 @@ struct ProcessMetrics { // Obtains the system metrics every UPDATE_INTERVAL and logs it. // The function should be spawned as a task and should be re-run if our main process is restarted. pub async fn init_metrics(pid: u32) { - let mut sys = System::new_all(); let mut networks = Networks::new_with_refreshed_list(); let pid = Pid::from_u32(pid); + let mut sys = System::new(); loop { refresh_metrics(&mut sys, &mut networks, pid); @@ -70,10 +70,10 @@ pub async fn init_metrics(pid: u32) { } }; - let cpu_stat = sys.global_cpu_info(); + let system_cpu_usage_percent = sys.global_cpu_usage(); let metrics = Metrics { physical_cpu_threads: sys.cpus().len(), - system_cpu_usage_percent: cpu_stat.cpu_usage(), + system_cpu_usage_percent, process, }; match serde_json::to_string(&metrics) { @@ -87,8 +87,17 @@ pub async fn init_metrics(pid: u32) { // Refreshes only the metrics that we interested in. 
fn refresh_metrics(sys: &mut System, networks: &mut Networks, pid: Pid) { - sys.refresh_process(pid); sys.refresh_memory(); - sys.refresh_cpu(); - networks.refresh(); + sys.refresh_cpu_usage(); + networks.refresh(true); + + // To refresh only the specific process: + sys.refresh_processes_specifics( + ProcessesToUpdate::Some(&[pid]), + true, + ProcessRefreshKind::nothing() + .with_cpu() + .with_disk_usage() + .with_memory(), + ); } diff --git a/ant-networking/Cargo.toml b/ant-networking/Cargo.toml index c1b02c82df..e02dd288dd 100644 --- a/ant-networking/Cargo.toml +++ b/ant-networking/Cargo.toml @@ -62,7 +62,7 @@ self_encryption = "~0.30.0" serde = { version = "1.0.133", features = ["derive", "rc"] } sha2 = "0.10" strum = { version = "0.26.2", features = ["derive"] } -sysinfo = { version = "0.30.8", default-features = false, optional = true } +sysinfo = { version = "0.33.1", optional = true } thiserror = "1.0.23" tiny-keccak = { version = "~2.0.2", features = ["sha3"] } tokio = { version = "1.32.0", features = [ diff --git a/ant-networking/src/metrics/mod.rs b/ant-networking/src/metrics/mod.rs index 2cc80fe424..cafd0666f2 100644 --- a/ant-networking/src/metrics/mod.rs +++ b/ant-networking/src/metrics/mod.rs @@ -25,7 +25,7 @@ use prometheus_client::{ metrics::family::Family, metrics::{counter::Counter, gauge::Gauge}, }; -use sysinfo::{Pid, ProcessRefreshKind, System}; +use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System}; use tokio::time::Duration; const UPDATE_INTERVAL: Duration = Duration::from_secs(15); @@ -246,12 +246,19 @@ impl NetworkMetricsRecorder { let pid = Pid::from_u32(std::process::id()); let process_refresh_kind = ProcessRefreshKind::everything().without_disk_usage(); - let mut system = System::new_all(); + + let mut system = System::new(); let physical_core_count = system.physical_core_count(); + info!("Detected physical_core_count {physical_core_count:?}"); tokio::spawn(async move { loop { - system.refresh_process_specifics(pid, 
process_refresh_kind); + system.refresh_processes_specifics( + ProcessesToUpdate::Some(&[pid]), + true, + process_refresh_kind, + ); + if let (Some(process), Some(core_count)) = (system.process(pid), physical_core_count) { @@ -264,6 +271,7 @@ impl NetworkMetricsRecorder { / 10000.0; let _ = process_cpu_usage_percentage.set(cpu_usage); } + sleep(UPDATE_INTERVAL).await; } }); diff --git a/ant-node-manager/Cargo.toml b/ant-node-manager/Cargo.toml index d9831185a8..b352d5b4b1 100644 --- a/ant-node-manager/Cargo.toml +++ b/ant-node-manager/Cargo.toml @@ -51,7 +51,7 @@ semver = "1.0.20" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" service-manager = "0.7.0" -sysinfo = "0.30.12" +sysinfo = "0.33.1" thiserror = "1.0.23" tokio = { version = "1.26", features = ["full"] } tracing = { version = "~0.1.26" } diff --git a/ant-node/Cargo.toml b/ant-node/Cargo.toml index f8e289a30b..2b213b4489 100644 --- a/ant-node/Cargo.toml +++ b/ant-node/Cargo.toml @@ -62,7 +62,7 @@ rayon = "1.8.0" self_encryption = "~0.30.0" serde = { version = "1.0.133", features = ["derive", "rc"] } strum = { version = "0.26.2", features = ["derive"] } -sysinfo = { version = "0.30.8", default-features = false } +sysinfo = "0.33.1" thiserror = "1.0.23" tokio = { version = "1.32.0", features = [ "io-util", diff --git a/ant-node/src/bin/antnode/main.rs b/ant-node/src/bin/antnode/main.rs index 3397d81461..7cc8d859db 100644 --- a/ant-node/src/bin/antnode/main.rs +++ b/ant-node/src/bin/antnode/main.rs @@ -413,7 +413,7 @@ You can check your reward balance by running: } }); let ctrl_tx_clone_cpu = ctrl_tx.clone(); - // Monitor host CPU usage + // Monitor Host CPU usage tokio::spawn(async move { use rand::{thread_rng, Rng}; @@ -427,18 +427,21 @@ You can check your reward balance by running: const JITTER_MIN_S: u64 = 1; const JITTER_MAX_S: u64 = 15; - let mut sys = System::new_all(); - let mut high_cpu_count: u8 = 0; + let mut system = System::new(); + system.refresh_cpu_usage(); + // Random 
initial delay between 1 and 5 minutes let initial_delay = Duration::from_secs(thread_rng().gen_range(INITIAL_DELAY_MIN_S..=INITIAL_DELAY_MAX_S)); tokio::time::sleep(initial_delay).await; loop { - sys.refresh_cpu(); - let cpu_usage = sys.global_cpu_info().cpu_usage(); + system.refresh_cpu_usage(); + + let cpu_usage = system.global_cpu_usage(); + info!("Detected Host CPU usage is {cpu_usage:?}."); if cpu_usage > CPU_USAGE_THRESHOLD { high_cpu_count += 1; @@ -458,7 +461,6 @@ You can check your reward balance by running: } break; } - // Add jitter to the interval let jitter = Duration::from_secs(thread_rng().gen_range(JITTER_MIN_S..=JITTER_MAX_S)); tokio::time::sleep(CPU_CHECK_INTERVAL + jitter).await; diff --git a/ant-service-management/Cargo.toml b/ant-service-management/Cargo.toml index acc2fe7d36..2ec8da4884 100644 --- a/ant-service-management/Cargo.toml +++ b/ant-service-management/Cargo.toml @@ -23,7 +23,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" semver = "1.0.20" service-manager = "0.7.0" -sysinfo = "0.30.12" +sysinfo = "0.33.1" thiserror = "1.0.23" tokio = { version = "1.32.0", features = ["time"] } tonic = { version = "0.6.2" } diff --git a/node-launchpad/Cargo.toml b/node-launchpad/Cargo.toml index d71003f3dc..9fb92ac398 100644 --- a/node-launchpad/Cargo.toml +++ b/node-launchpad/Cargo.toml @@ -63,7 +63,7 @@ serde_json = "1.0.107" signal-hook = "0.3.17" strip-ansi-escapes = "0.2.0" strum = { version = "0.26.1", features = ["derive"] } -sysinfo = "0.30.12" +sysinfo = "0.33.1" throbber-widgets-tui = "0.8.0" tokio = { version = "1.32.0", features = ["full"] } tokio-util = "0.7.9" From 146651d49fb16c2f1495ffd4cab1da89f76c40ca Mon Sep 17 00:00:00 2001 From: qima Date: Tue, 14 Jan 2025 22:57:06 +0800 Subject: [PATCH 2/2] test(CI): confirm antnode does not open too many FDs --- .github/workflows/memcheck.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/memcheck.yml b/.github/workflows/memcheck.yml index 
f0efbefda5..89350fc49f 100644 --- a/.github/workflows/memcheck.yml +++ b/.github/workflows/memcheck.yml @@ -164,6 +164,22 @@ jobs: run: pgrep antnode | wc -l if: always() + - name: confirm opened FDs + shell: bash + timeout-minutes: 1 + run: | + fd_cap="30" + pids=$(pgrep antnode) + for pid in $pids; do + fd_count=$(ls /proc/$pid/fd | wc -l) + echo "Process $pid - File Descriptors: $fd_count" + if (( $(echo "$fd_count > $fd_cap" | bc -l) )); then + echo "Process $pid holding FD exceeded threshold: $fd_cap" + exit 1 + fi + done + if: always() + - name: Stop the local network and upload logs if: always() uses: maidsafe/ant-local-testnet-action@main