Skip to content

Commit

Permalink
Merge pull request #162 from lars-t-hansen/w-152-final-cleanup
Browse files Browse the repository at this point in the history
Fix #152 - Bump version, documentation, tidy up
  • Loading branch information
bast committed May 16, 2024
2 parents fc07391 + faffa0a commit 4a41c81
Show file tree
Hide file tree
Showing 29 changed files with 487 additions and 128 deletions.
13 changes: 13 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,20 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v2

- name: Build
run: cargo build --verbose

- name: Test
run: cargo test --verbose

# one of the shell tests below needs jq
- name: Install jq
run: |
sudo apt-get update
sudo apt-get install -y jq
- name: Shell tests
run: |
cd tests
./run_tests.sh
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ edition = "2021"

[dependencies]
libc = "0.2"
subprocess = "0.2"
subprocess = { version = "= 0.2.9" }
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,22 @@ of processes.

Optionally, `sonar` will use a lockfile to avoid a pile-up of processes.

## Dependencies and updates

Sonar runs everywhere and all the time, and even though it currently runs without privileges it
strives to have as few dependencies as possible, so as not to become a target through a supply chain
attack. There are some rules:

- It's OK to depend on libc and to incorporate new versions of libc
- It's better to depend on something from the rust-lang organization than on something else
- Every dependency needs to be justified
- Every dependency must have a compatible license
- Every dependency needs to be vetted as to active development, apparent quality, test cases
- Every dependency update - even for security issues - is to be considered a code change that needs review
- Remember that indirect dependencies are dependencies for us, too, and need to be treated the same way
- If in doubt: copy the parts we need, vet them thoroughly, and maintain them separately

There is a useful discussion of these matters [here](https://research.swtch.com/deps).

## How we run sonar on a cluster

Expand Down
6 changes: 5 additions & 1 deletion src/amd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,11 @@ pub fn get_amd_information(user_by_pid: &UserTable) -> Result<Vec<gpu::Process>,

match command::safe_command(AMD_CONCISE_COMMAND, AMD_CONCISE_ARGS, TIMEOUT_SECONDS) {
Ok(concise_raw_text) => {
match command::safe_command(AMD_SHOWPIDGPUS_COMMAND, AMD_SHOWPIDGPUS_ARGS, TIMEOUT_SECONDS) {
match command::safe_command(
AMD_SHOWPIDGPUS_COMMAND,
AMD_SHOWPIDGPUS_ARGS,
TIMEOUT_SECONDS,
) {
Ok(showpidgpus_raw_text) => Ok(extract_amd_information(
&concise_raw_text,
&showpidgpus_raw_text,
Expand Down
10 changes: 8 additions & 2 deletions src/command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ pub enum CmdError {
// especially at https://github.com/rust-lang/rust/issues/45572#issuecomment-860134955. See also
// https://doc.rust-lang.org/std/process/index.html (second code blob under "Handling I/O").

pub fn safe_command(command: &str, args: &[&str], timeout_seconds: u64) -> Result<String, CmdError> {
pub fn safe_command(
command: &str,
args: &[&str],
timeout_seconds: u64,
) -> Result<String, CmdError> {
let mut p = match Exec::cmd(command)
.args(args)
.stdout(Redirection::Pipe)
Expand Down Expand Up @@ -168,7 +172,9 @@ fn test_safe_command() {
Err(_) => assert!(false),
}
// This really needs to be the output
assert!(safe_command("grep", &["^name =", "Cargo.toml"], 2) == Ok("name = \"sonar\"\n".to_string()));
assert!(
safe_command("grep", &["^name =", "Cargo.toml"], 2) == Ok("name = \"sonar\"\n".to_string())
);
// Not found
match safe_command("no-such-command-we-hope", &[], 2) {
Err(CmdError::CouldNotStart(_)) => {}
Expand Down
13 changes: 4 additions & 9 deletions src/hostname.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,24 +28,20 @@ SOFTWARE.
*/

use std::ffi::OsString;
use std::os::unix::ffi::OsStringExt;
use std::io;
use libc;
use std::os::unix::ffi::OsStringExt;

pub fn get() -> io::Result<OsString> {
// According to the POSIX specification,
// host names are limited to `HOST_NAME_MAX` bytes
//
// https://pubs.opengroup.org/onlinepubs/9699919799/functions/gethostname.html
let size =
unsafe { libc::sysconf(libc::_SC_HOST_NAME_MAX) as libc::size_t };
let size = unsafe { libc::sysconf(libc::_SC_HOST_NAME_MAX) as libc::size_t };

// Stack buffer OK: HOST_NAME_MAX is typically very small (64 on Linux).
let mut buffer = vec![0u8; size];

let result = unsafe {
libc::gethostname(buffer.as_mut_ptr() as *mut libc::c_char, size)
};
let result = unsafe { libc::gethostname(buffer.as_mut_ptr() as *mut libc::c_char, size) };

if result != 0 {
return Err(io::Error::last_os_error());
Expand All @@ -61,9 +57,8 @@ fn wrap_buffer(mut bytes: Vec<u8>) -> OsString {
let end = bytes
.iter()
.position(|&byte| byte == 0x00)
.unwrap_or_else(|| bytes.len());
.unwrap_or(bytes.len());
bytes.resize(end, 0x00);

OsString::from_vec(bytes)
}

14 changes: 9 additions & 5 deletions src/interrupt.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#[cfg(debug_assertions)]
use crate::log;

use std::sync::atomic::{AtomicBool, Ordering};

// Signal handling logic.
Expand All @@ -19,15 +22,15 @@ extern "C" fn sonar_signal_handler(_: libc::c_int) {

pub fn handle_interruptions() {
unsafe {
let nomask : libc::sigset_t = std::mem::zeroed();
let mut action = libc::sigaction {
let nomask: libc::sigset_t = std::mem::zeroed();
let action = libc::sigaction {
sa_sigaction: sonar_signal_handler as usize,
sa_mask: nomask,
sa_flags: 0,
sa_restorer: None,
};
libc::sigaction(libc::SIGTERM, &mut action, std::ptr::null_mut());
libc::sigaction(libc::SIGHUP, &mut action, std::ptr::null_mut());
libc::sigaction(libc::SIGTERM, &action, std::ptr::null_mut());
libc::sigaction(libc::SIGHUP, &action, std::ptr::null_mut());
}
}

Expand All @@ -38,7 +41,8 @@ pub fn is_interrupted() -> bool {
}
let flag = INTERRUPTED.load(Ordering::Relaxed);
if flag {
println!("Interrupt flag was set!")
// Test cases depend on this exact output.
log::info("Interrupt flag was set!")
}
flag
}
Expand Down
3 changes: 2 additions & 1 deletion src/jobs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ pub trait JobManager {
//
// There's an assumption here that the process map is always the same for all lookups
// performed on a particular instance of JobManager.
fn job_id_from_pid(&mut self, pid: usize, processes: &HashMap<usize, procfs::Process>) -> usize;
fn job_id_from_pid(&mut self, pid: usize, processes: &HashMap<usize, procfs::Process>)
-> usize;
}
131 changes: 79 additions & 52 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ mod users;
mod util;

const TIMEOUT_SECONDS: u64 = 5; // For subprocesses
const USAGE_ERROR: i32 = 2; // clap, Python, Go
const USAGE_ERROR: i32 = 2; // clap, Python, Go

enum Commands {
/// Take a snapshot of the currently running processes
Expand Down Expand Up @@ -115,10 +115,12 @@ fn main() {
// - all error reporting is via a generic "usage" message, without specificity as to what was wrong

fn command_line() -> Commands {
let mut args = std::env::args();
let _executable = args.next();
if let Some(command) = args.next() {
match command.as_str() {
let args = std::env::args().collect::<Vec<String>>();
let mut next = 1;
if next < args.len() {
let command = args[next].as_ref();
next += 1;
match command {
"ps" => {
let mut batchless = false;
let mut rollup = false;
Expand All @@ -129,42 +131,43 @@ fn command_line() -> Commands {
let mut exclude_users = None;
let mut exclude_commands = None;
let mut lockdir = None;
loop {
if let Some(arg) = args.next() {
match arg.as_str() {
"--batchless" => {
batchless = true;
}
"--rollup" => {
rollup = true;
}
"--exclude-system-jobs" => {
exclude_system_jobs = true;
}
"--exclude-users" => {
(args, exclude_users) = string_value(args);
}
"--exclude-commands" => {
(args, exclude_commands) = string_value(args);
}
"--lockdir" => {
(args, lockdir) = string_value(args);
}
"--min-cpu-percent" => {
(args, min_cpu_percent) = parsed_value::<f64>(args);
}
"--min-mem-percent" => {
(args, min_mem_percent) = parsed_value::<f64>(args);
}
"--min-cpu-time" => {
(args, min_cpu_time) = parsed_value::<usize>(args);
}
_ => {
usage(true);
}
}
while next < args.len() {
let arg = args[next].as_ref();
next += 1;
if let Some(new_next) = bool_arg(arg, &args, next, "--batchless") {
(next, batchless) = (new_next, true);
} else if let Some(new_next) = bool_arg(arg, &args, next, "--rollup") {
(next, rollup) = (new_next, true);
} else if let Some(new_next) =
bool_arg(arg, &args, next, "--exclude-system-jobs")
{
(next, exclude_system_jobs) = (new_next, true);
} else if let Some((new_next, value)) =
string_arg(arg, &args, next, "--exclude-users")
{
(next, exclude_users) = (new_next, Some(value));
} else if let Some((new_next, value)) =
string_arg(arg, &args, next, "--exclude-commands")
{
(next, exclude_commands) = (new_next, Some(value));
} else if let Some((new_next, value)) =
string_arg(arg, &args, next, "--lockdir")
{
(next, lockdir) = (new_next, Some(value));
} else if let Some((new_next, value)) =
numeric_arg::<f64>(arg, &args, next, "--min-cpu-percent")
{
(next, min_cpu_percent) = (new_next, Some(value));
} else if let Some((new_next, value)) =
numeric_arg::<f64>(arg, &args, next, "--min-mem-percent")
{
(next, min_mem_percent) = (new_next, Some(value));
} else if let Some((new_next, value)) =
numeric_arg::<usize>(arg, &args, next, "--min-cpu-time")
{
(next, min_cpu_time) = (new_next, Some(value));
} else {
break;
usage(true);
}
}

Expand All @@ -178,7 +181,8 @@ fn command_line() -> Commands {
eprintln!("--rollup and --batchless are incompatible");
std::process::exit(USAGE_ERROR);
}
return Commands::PS {

Commands::PS {
batchless,
rollup,
min_cpu_percent,
Expand All @@ -188,9 +192,9 @@ fn command_line() -> Commands {
exclude_users,
exclude_commands,
lockdir,
};
}
}
"sysinfo" => return Commands::Sysinfo {},
"sysinfo" => Commands::Sysinfo {},
"help" => {
usage(false);
}
Expand All @@ -203,24 +207,47 @@ fn command_line() -> Commands {
}
}

fn string_value(mut args: std::env::Args) -> (std::env::Args, Option<String>) {
if let Some(val) = args.next() {
(args, Some(val))
fn bool_arg(arg: &str, _args: &[String], next: usize, opt_name: &str) -> Option<usize> {
if arg == opt_name {
Some(next)
} else {
usage(true);
None
}
}

fn string_arg(arg: &str, args: &[String], next: usize, opt_name: &str) -> Option<(usize, String)> {
if arg == opt_name {
if next < args.len() {
Some((next + 1, args[next].to_string()))
} else {
None
}
} else if let Some((first, rest)) = arg.split_once('=') {
if first == opt_name {
Some((next, rest.to_string()))
} else {
None
}
} else {
None
}
}

fn parsed_value<T: std::str::FromStr>(mut args: std::env::Args) -> (std::env::Args, Option<T>) {
if let Some(val) = args.next() {
match val.parse::<T>() {
Ok(value) => (args, Some(value)),
fn numeric_arg<T: std::str::FromStr>(
arg: &str,
args: &[String],
next: usize,
opt_name: &str,
) -> Option<(usize, T)> {
if let Some((next, strval)) = string_arg(arg, args, next, opt_name) {
match strval.parse::<T>() {
Ok(value) => Some((next, value)),
_ => {
usage(true);
}
}
} else {
usage(true);
None
}
}

Expand Down
6 changes: 4 additions & 2 deletions src/nvidia.rs
Original file line number Diff line number Diff line change
Expand Up @@ -238,8 +238,10 @@ fn parse_pmon_output(raw_text: &str, user_by_pid: &UserTable) -> Result<Vec<gpu:

const NVIDIA_QUERY_COMMAND: &str = "nvidia-smi";

const NVIDIA_QUERY_ARGS: &[&str] =
&["--query-compute-apps=pid,used_memory", "--format=csv,noheader,nounits"];
const NVIDIA_QUERY_ARGS: &[&str] = &[
"--query-compute-apps=pid,used_memory",
"--format=csv,noheader,nounits",
];

// Same signature as extract_nvidia_pmon_processes(), q.v. but user is always "_zombie_<PID>" and
// command is always "_unknown_". Only pids not in user_by_pid are returned.
Expand Down
Loading

0 comments on commit 4a41c81

Please sign in to comment.