From f0685c4e3d73d0b12ac1f39d3a0540688f035740 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Mon, 29 May 2023 00:54:28 -0700 Subject: [PATCH] client: avoid overscheduling CPUs in presence of MT jobs In 20ff585 we changed the sched policy so that e.g. if there are 2 4-CPU jobs on a 6-CPU host, it runs them both. I.e. overscheduling the CPUs is better than starving them. This commit refines this a bit: if in addition to the MT jobs there are some 1-CPU jobs, it runs one MT job and two of the 1-CPU jobs. Also: show resource usage in cpu_sched_debug messages Also: if CPUs are starved, trigger a work request. This logic was mistakenly hidden in an if (log_flags.cpu_sched_debug) Also: don't ignore log flags in the simulator --- client/cpu_sched.cpp | 91 +++++++++++++++++++++++++------------------- client/result.h | 13 +++++++ client/rr_sim.cpp | 17 +-------- client/sim.cpp | 2 +- client/sim_util.cpp | 2 +- 5 files changed, 69 insertions(+), 56 deletions(-) diff --git a/client/cpu_sched.cpp b/client/cpu_sched.cpp index 3e0c8973e27..dafcb053312 100644 --- a/client/cpu_sched.cpp +++ b/client/cpu_sched.cpp @@ -1087,11 +1087,14 @@ static inline bool more_important(RESULT* r0, RESULT* r1) { } static void print_job_list(vector& jobs) { + char buf[256]; for (unsigned int i=0; irsc_string(buf, 256); msg_printf(rp->project, MSG_INFO, - "[cpu_sched_debug] %d: %s (MD: %s; UTS: %s)", + "[cpu_sched_debug] %d: %s (%s; MD: %s; UTS: %s)", i, rp->name, + buf, rp->edf_scheduled?"yes":"no", rp->unfinished_time_slice?"yes":"no" ); @@ -1122,18 +1125,21 @@ void CLIENT_STATE::append_unfinished_time_slice(vector &run_list) { // Enforce the CPU schedule. // Inputs: -// ordered_scheduled_results -// List of tasks that should (ideally) run, set by schedule_cpus(). -// Most important tasks (e.g. early deadline) are first. 
-// The set of tasks that actually run may be different: -// - if a task hasn't checkpointed recently we avoid preempting it -// - we don't run tasks that would exceed working-set limits -// Details: -// Initially, each task's scheduler_state is PREEMPTED or SCHEDULED -// depending on whether or not it is running. -// This function sets each task's next_scheduler_state, -// and at the end it starts/resumes and preempts tasks -// based on scheduler_state and next_scheduler_state. +// run_list: list of runnable jobs, ordered by decreasing project priority +// (created by make_run_list()) +// Doesn't include all jobs, but enough to fill CPUs even in MT scenarios. +// +// - append running jobs that haven't finished their time slice +// - order the list by "importance" (which includes various factors) +// - then scan the list and run jobs +// - until we've used all resources +// - skip jobs that would exceed mem limits +// +// Initially, each task's scheduler_state is PREEMPTED or SCHEDULED +// depending on whether or not it is running. +// This function sets each task's next_scheduler_state, +// and at the end it starts/resumes and preempts tasks +// based on scheduler_state and next_scheduler_state. 
// bool CLIENT_STATE::enforce_run_list(vector& run_list) { unsigned int i; @@ -1175,11 +1181,11 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { rp->unfinished_time_slice = false; } - // append running jobs not done with time slice to the to-run list + // add running jobs not done with time slice to the run list // append_unfinished_time_slice(run_list); - // sort to-run list by decreasing importance + // sort run list by decreasing importance // std::sort( run_list.begin(), @@ -1207,7 +1213,6 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { } // schedule non-CPU-intensive tasks, - // and look for backed-off GPU jobs // for (i=0; i& run_list) { // and prune those that can't be assigned // assign_coprocs(run_list); - //bool scheduled_mt = false; - // prune jobs that don't fit in RAM or that exceed CPU usage limits. - // Mark the rest as SCHEDULED + // keep track of the number of remaining single-CPU jobs + // + int n_single_cpu_jobs = 0; + for (RESULT* rp: run_list) { + if (!rp->uses_coprocs() && rp->avp->avg_ncpus==1) { + n_single_cpu_jobs++; + } + } + + // scan the run list // for (i=0; i& run_list) { atp = lookup_active_task_by_result(rp); - // if we're already using all the CPUs, - // don't allow additional CPU jobs; + // if we're already using all the CPUs, don't allow additional CPU jobs; // allow coproc jobs if the resulting CPU load is at most ncpus+1 // if (ncpus_used >= n_usable_cpus) { @@ -1280,13 +1291,12 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { } } -#if 0 - // Don't overcommit CPUs by > 1 if a MT job is scheduled. - // Skip this check for coproc jobs. 
+ // Don't overcommit CPUs by > 1 unless needed to avoid starvation // if (!rp->uses_coprocs() - && (scheduled_mt || (rp->avp->avg_ncpus > 1)) - && (ncpus_used + rp->avp->avg_ncpus > ncpus + 1) + && (rp->avp->avg_ncpus > 1) + && (ncpus_used + rp->avp->avg_ncpus > n_usable_cpus + 1) + && (ncpus_used + n_single_cpu_jobs >= n_usable_cpus) ) { if (log_flags.cpu_sched_debug) { msg_printf(rp->project, MSG_INFO, @@ -1296,7 +1306,6 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { } continue; } -#endif // skip jobs whose working set is too large to fit in available RAM // @@ -1316,22 +1325,23 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { } if (log_flags.cpu_sched_debug || log_flags.mem_usage_debug) { msg_printf(rp->project, MSG_INFO, - "[cpu_sched_debug] enforce: task %s can't run, too big %.2fMB > %.2fMB", + "[cpu_sched_debug] can't run %s: WS too big %.2fMB > %.2fMB", rp->name, wss/MEGA, ram_left/MEGA ); } continue; } + // We've decided to run this job + // if (log_flags.cpu_sched_debug) { msg_printf(rp->project, MSG_INFO, "[cpu_sched_debug] scheduling %s%s", - rp->name, - rp->edf_scheduled?" (high priority)":"" + rp->name, rp->edf_scheduled?" (high priority)":"" ); } - // We've decided to run this job; create an ACTIVE_TASK if needed. + // create an ACTIVE_TASK if needed. 
// if (!atp) { atp = get_task(rp); @@ -1343,11 +1353,10 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { continue; } -#if 0 - if (rp->avp->avg_ncpus > 1) { - scheduled_mt = true; + if (!rp->uses_coprocs() && rp->avp->avg_ncpus == 1) { + n_single_cpu_jobs--; } -#endif + ncpus_used += rp->avp->avg_ncpus; atp->next_scheduler_state = CPU_SCHED_SCHEDULED; ram_left -= wss; @@ -1356,13 +1365,17 @@ bool CLIENT_STATE::enforce_run_list(vector& run_list) { } } - if (log_flags.cpu_sched_debug && ncpus_used < n_usable_cpus) { - msg_printf(0, MSG_INFO, "[cpu_sched_debug] using %.2f out of %d CPUs", - ncpus_used, n_usable_cpus - ); + // if CPUs are starved, ask for more jobs + // + if (ncpus_used < n_usable_cpus) { if (ncpus_used < n_usable_cpus) { request_work_fetch("CPUs idle"); } + if (log_flags.cpu_sched_debug) { + msg_printf(0, MSG_INFO, "[cpu_sched_debug] using only %.2f out of %d CPUs", + ncpus_used, n_usable_cpus + ); + } } bool check_swap = (host_info.m_swap != 0); diff --git a/client/result.h b/client/result.h index 38cf2a8a1a0..632394b35fd 100644 --- a/client/result.h +++ b/client/result.h @@ -165,6 +165,19 @@ struct RESULT { if (avp->dont_throttle) return true; return false; } + // make a string describing resource usage + inline void rsc_string(char* buf, int len) { + if (avp->gpu_usage.rsc_type) { + snprintf(buf, len, + "%.2f CPU + %.2f %s", + avp->avg_ncpus, avp->gpu_usage.usage, + rsc_name_long(avp->gpu_usage.rsc_type) + ); + } else { + snprintf(buf, len, "%.2f CPU", avp->avg_ncpus); + } + } + // temporaries used in CLIENT_STATE::rr_simulation(): double rrsim_flops_left; diff --git a/client/rr_sim.cpp b/client/rr_sim.cpp index 55b37ad3ea2..a7fe512cf13 100644 --- a/client/rr_sim.cpp +++ b/client/rr_sim.cpp @@ -52,19 +52,6 @@ using std::vector; -inline void rsc_string(RESULT* rp, char* buf, int len) { - APP_VERSION* avp = rp->avp; - if (avp->gpu_usage.rsc_type) { - snprintf(buf, len, - "%.2f CPU + %.2f %s", - avp->avg_ncpus, avp->gpu_usage.usage, - 
rsc_name_long(avp->gpu_usage.rsc_type) - ); - } else { - snprintf(buf, len, "%.2f CPU", avp->avg_ncpus); - } -} - // set "nused" bits of the source bitmap in the dest bitmap // static inline void set_bits( @@ -314,7 +301,7 @@ void RR_SIM::pick_jobs_to_run(double reltime) { adjust_rec_sched(rp); if (log_flags.rrsim_detail && !rp->already_selected) { char buf[256]; - rsc_string(rp, buf, sizeof(buf)); + rp->rsc_string(buf, sizeof(buf)); msg_printf(rp->project, MSG_INFO, "[rr_sim_detail] %.2f: starting %s (%s) (%.2fG/%.2fG)", reltime, rp->name, buf, rp->rrsim_flops_left/1e9, @@ -558,7 +545,7 @@ void RR_SIM::simulate() { pbest = rpbest->project; if (log_flags.rr_simulation) { char buf[256]; - rsc_string(rpbest, buf, sizeof(buf)); + rpbest->rsc_string(buf, sizeof(buf)); msg_printf(pbest, MSG_INFO, "[rr_sim] %.2f: %s finishes (%s) (%.2fG/%.2fG)", sim_now + delta_t - gstate.now, diff --git a/client/sim.cpp b/client/sim.cpp index 4698b8989c3..f82407801c5 100644 --- a/client/sim.cpp +++ b/client/sim.cpp @@ -1423,9 +1423,9 @@ void do_client_simulation() { snprintf(buf, sizeof(buf), "%s%s", infile_prefix, CONFIG_FILE); cc_config.defaults(); + log_flags.init(); read_config_file(true, buf); - log_flags.init(); snprintf(buf, sizeof(buf), "%s%s", outfile_prefix, "log_flags.xml"); f = fopen(buf, "r"); if (f) { diff --git a/client/sim_util.cpp b/client/sim_util.cpp index 5f67dae3a03..03a693cc399 100644 --- a/client/sim_util.cpp +++ b/client/sim_util.cpp @@ -128,7 +128,7 @@ int ACTIVE_TASK::resume_or_start(bool first_time) { ); } set_task_state(PROCESS_EXECUTING, "start"); - char buf[256]; + char buf[1024]; snprintf(buf, sizeof(buf), "Starting %s
  %s
  deadline %s
", result->name, result->project->get_project_name(), sim_time_string(result->report_deadline)