diff --git a/documentation/pestpp_users_guide_v5.2.15.docx b/documentation/pestpp_users_guide_v5.2.16.docx
similarity index 73%
rename from documentation/pestpp_users_guide_v5.2.15.docx
rename to documentation/pestpp_users_guide_v5.2.16.docx
index 92a60ec9..ef07906b 100644
Binary files a/documentation/pestpp_users_guide_v5.2.15.docx and b/documentation/pestpp_users_guide_v5.2.16.docx differ
diff --git a/documentation/pestpp_users_manual.md b/documentation/pestpp_users_manual.md
index 42c3e78f..4e7328b7 100644
--- a/documentation/pestpp_users_manual.md
+++ b/documentation/pestpp_users_manual.md
@@ -1,13 +1,13 @@
-# Version 5.2.15
+# Version 5.2.16
PEST++ Development Team
-November 2024
+December 2024
# Acknowledgements
@@ -70,7 +70,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
# Table of Contents
-- [Version 5.2.15](#s1)
+- [Version 5.2.16](#s1)
- [Acknowledgements](#s2)
- [Preface](#s3)
- [License](#s4)
diff --git a/src/libs/common/config_os.h b/src/libs/common/config_os.h
index 2b1d663a..a832d243 100644
--- a/src/libs/common/config_os.h
+++ b/src/libs/common/config_os.h
@@ -2,7 +2,7 @@
#define CONFIG_OS_H_
-#define PESTPP_VERSION "5.2.15";
+#define PESTPP_VERSION "5.2.16";
#if defined(_WIN32) || defined(_WIN64)
#define OS_WIN
diff --git a/src/libs/pestpp_common/EnsembleMethodUtils.cpp b/src/libs/pestpp_common/EnsembleMethodUtils.cpp
index 9756f1bf..003ef593 100644
--- a/src/libs/pestpp_common/EnsembleMethodUtils.cpp
+++ b/src/libs/pestpp_common/EnsembleMethodUtils.cpp
@@ -7750,7 +7750,7 @@ void EnsembleMethod::reset_par_ensemble_to_prior_mean(){
ss << "iteration:" << iter;
vector temp;
ofstream& frec = file_manager.rec_ofstream();
- oe = oe_base;
+ oe.reserve(oe_base.get_real_names(),oe.get_var_names());
weights = weights_base;
run_ensemble_util(performance_log,frec,new_pe,oe,run_mgr_ptr,false,temp,NetPackage::NULL_DA_CYCLE, ss.str());
pe = new_pe;
diff --git a/src/libs/run_managers/yamr/PantherAgent.cpp b/src/libs/run_managers/yamr/PantherAgent.cpp
index 702af11e..f985bce6 100644
--- a/src/libs/run_managers/yamr/PantherAgent.cpp
+++ b/src/libs/run_managers/yamr/PantherAgent.cpp
@@ -151,7 +151,7 @@ void PANTHERAgent::process_ctl_file(const string &ctl_filename)
mi.set_fill_tpl_zeros(pest_scenario.get_pestpp_options().get_fill_tpl_zeros());
mi.set_tpl_force_decimal(pest_scenario.get_pestpp_options().get_tpl_force_decimal());
mi.set_num_threads(pest_scenario.get_pestpp_options().get_num_tpl_ins_threads());
- mi.set_sleep_ms(100);
+ mi.set_sleep_ms(5);
restart_on_error = pest_scenario.get_pestpp_options().get_panther_agent_restart_on_error();
max_time_without_master_ping_seconds = pest_scenario.get_pestpp_options().get_panther_agent_no_ping_timeout_secs();
FileManager fm("panther_agent");
@@ -538,7 +538,7 @@ std::pair PANTHERAgent::run_model(Parameters &
void PANTHERAgent::run_async(pest_utils::thread_flag* terminate, pest_utils::thread_flag* finished, exception_ptr& run_exception,
Parameters* pars, Observations* obs)
{
- mi.set_sleep_ms(100);
+ mi.set_sleep_ms(5);
mi.run(terminate,finished,run_exception, pars, obs);
}
diff --git a/src/libs/run_managers/yamr/RunManagerPanther.cpp b/src/libs/run_managers/yamr/RunManagerPanther.cpp
index 00c17552..6140d0d2 100644
--- a/src/libs/run_managers/yamr/RunManagerPanther.cpp
+++ b/src/libs/run_managers/yamr/RunManagerPanther.cpp
@@ -50,8 +50,9 @@ const int RunManagerPanther::N_PINGS_UNRESPONSIVE = 3;
const int RunManagerPanther::MIN_PING_INTERVAL_SECS = 60; // Ping each slave at most once every minute
const int RunManagerPanther::MAX_PING_INTERVAL_SECS = 120; // Ping each slave at least once every 2 minutes
const int RunManagerPanther::MAX_CONCURRENT_RUNS_LOWER_LIMIT = 1;
-const int RunManagerPanther::IDLE_THREAD_SIGNAL_TIMEOUT_SECS = 10; // Allow up to 10s for the run_idle_async() thread to acknowledge signals (pause idling, terminate)
-
+const int RunManagerPanther::IDLE_THREAD_SIGNAL_TIMEOUT_SECS = 10; // Allow up to 10s for the run_idle_async() thread to acknowledge signals (pause idling, terminate)
+const double RunManagerPanther::MIN_AVGRUNMINS_FOR_KILL = 0.08; //minimum avg runtime to try to kill and/or resched runs
+const int RunManagerPanther::SECONDS_BETWEEN_ECHOS = 1;
AgentInfoRec::AgentInfoRec(int _socket_fd)
{
@@ -520,6 +521,7 @@ RunManagerAbstract::RUN_UNTIL_COND RunManagerPanther::run_until(RUN_UNTIL_COND c
}
std::chrono::system_clock::time_point start_time = std::chrono::system_clock::now();
+ last_echo_time = std::chrono::system_clock::now();
double run_time_sec = 0.0;
while (!all_runs_complete() && terminate_reason == RUN_UNTIL_COND::NORMAL)
{
@@ -560,7 +562,7 @@ RunManagerAbstract::RUN_UNTIL_COND RunManagerPanther::run_until(RUN_UNTIL_COND c
}
}
- w_sleep(100);
+ w_sleep(10);
n_no_ops = 0;
while (true)
{
@@ -726,7 +728,7 @@ void RunManagerPanther::run_idle_async()
idling.set(false);
// Sleep 1s to avoid spinlock
- w_sleep(100);
+ w_sleep(10);
continue;
}
@@ -816,7 +818,7 @@ void RunManagerPanther::end_run_idle_async()
}
// Sleep to avoid spinlock
- w_sleep(50);
+ w_sleep(10);
}
report("Stopped idle ping thread, as Panther manager is shutting down.", false);
@@ -857,7 +859,7 @@ void RunManagerPanther::pause_idle()
}
// Sleep to avoid spinlock
- w_sleep(50);
+ w_sleep(10);
}
report("Panther idle ping thread paused prior to scheduling runs.", false);
@@ -947,7 +949,7 @@ bool RunManagerPanther::listen(pest_utils::thread_flag* terminate/* = nullptr*/)
fd_set read_fds; // temp file descriptor list for select()
socklen_t addr_len;
timeval tv;
- tv.tv_sec = 1;
+ tv.tv_sec = 0;
tv.tv_usec = 0;
read_fds = master; // copy it
if (w_select(fdmax+1, &read_fds, NULL, NULL, &tv) == -1)
@@ -1006,7 +1008,7 @@ void RunManagerPanther::close_agents()
sock_nums.push_back(si.first);
for (auto si : sock_nums)
close_agent(si);
- w_sleep(100);
+ w_sleep(10);
}
}
@@ -1107,7 +1109,7 @@ void RunManagerPanther::schedule_runs()
duration = it_agent->get_duration_minute();
avg_runtime = it_agent->get_runtime_minute();
if (avg_runtime <= 0) avg_runtime = global_avg_runtime;
- if (avg_runtime <= 0) avg_runtime = 1.0E+10;
+ if (avg_runtime <= 0) avg_runtime = 1.0E+300;
vector overdue_kill_runs_vec = get_overdue_runs_over_kill_threshold(run_id);
if (failure_map.count(run_id) + overdue_kill_runs_vec.size() >= max_n_failure)
@@ -1131,7 +1133,9 @@ void RunManagerPanther::schedule_runs()
should_schedule = true;
model_runs_timed_out += overdue_kill_runs_vec.size();
}
- else if (((duration > overdue_giveup_minutes) || (duration > avg_runtime*overdue_giveup_fac))
+
+ else if (((duration > overdue_giveup_minutes) || ((duration > avg_runtime*overdue_giveup_fac) &&
+ (avg_runtime > MIN_AVGRUNMINS_FOR_KILL)))
&& free_agent_list.empty())
{
// If there are no free slaves kill the overdue ones
@@ -1147,7 +1151,8 @@ void RunManagerPanther::schedule_runs()
}
model_runs_timed_out += 1;
}
- else if (duration > avg_runtime*overdue_reched_fac)
+
+ else if ((duration > avg_runtime*overdue_reched_fac) && (avg_runtime > MIN_AVGRUNMINS_FOR_KILL))
{
//check how many concurrent runs are going
if (n_concur < max_concurrent_runs) should_schedule = true;
@@ -1285,6 +1290,10 @@ void RunManagerPanther::echo()
{
if (!should_echo)
return;
+ std::chrono::system_clock::time_point now = chrono::system_clock::now();
+ if (chrono::duration_cast ( now- last_echo_time).count() < SECONDS_BETWEEN_ECHOS)
+ return;
+ last_echo_time = now;
map stats_map = get_agent_stats();
cout << get_time_string_short() << " mn:" << setw(5) << setprecision(2) << left << get_global_runtime_minute() << " runs("
<< "C" << setw(5) << left << model_runs_done
@@ -1939,7 +1948,9 @@ void RunManagerPanther::kill_all_active_runs()
if (avg_runtime <= 0) avg_runtime = get_global_runtime_minute();;
if (avg_runtime <= 0) avg_runtime = 1.0E+10;
duration = i->second->get_duration_minute();
- if ((just_quit) || (duration > overdue_giveup_minutes) || (duration >= avg_runtime*overdue_giveup_fac))
+ if ((just_quit) || (duration > overdue_giveup_minutes) ||
+ ((duration >= avg_runtime*overdue_giveup_fac) &&
+ (avg_runtime > MIN_AVGRUNMINS_FOR_KILL)))
{
sock_id_vec.push_back(i->second->get_socket_fd());
}
@@ -2132,7 +2143,7 @@ RunManagerPanther::~RunManagerPanther(void)
err = w_close(listener);
FD_CLR(listener, &master);
// this is needed to ensure that the first slave closes properly
- w_sleep(500);
+ w_sleep(10);
for (int i = 0; i <= fdmax; i++)
{
if (FD_ISSET(i, &master))
@@ -2248,10 +2259,10 @@ void RunManagerYAMRCondor::cleanup(int cluster)
stringstream ss;
ss << "condor_rm " << cluster << " 1>cr_temp.stdout 2>cr_temp.stderr";
system(ss.str().c_str());
- w_sleep(500);
+ w_sleep(10);
ss.str(string());
ss << "condor_rm " << cluster << " -forcex 1>cr_temp.stdout 2>cr_temp.stderr";
- w_sleep(500);
+ w_sleep(10);
system(ss.str().c_str());
RunManagerPanther::close_agents();
cout << " all agents freed " << endl << endl;
diff --git a/src/libs/run_managers/yamr/RunManagerPanther.h b/src/libs/run_managers/yamr/RunManagerPanther.h
index 8809121e..535813fd 100644
--- a/src/libs/run_managers/yamr/RunManagerPanther.h
+++ b/src/libs/run_managers/yamr/RunManagerPanther.h
@@ -127,7 +127,8 @@ class RunManagerPanther : public RunManagerAbstract
static const int MAX_PING_INTERVAL_SECS;
static const int MAX_CONCURRENT_RUNS_LOWER_LIMIT;
static const int IDLE_THREAD_SIGNAL_TIMEOUT_SECS;
-
+ static const double MIN_AVGRUNMINS_FOR_KILL;
+ static const int SECONDS_BETWEEN_ECHOS;
double overdue_reched_fac;
double overdue_giveup_fac;
double overdue_giveup_minutes;
@@ -141,6 +142,7 @@ class RunManagerPanther : public RunManagerAbstract
long long bytes_transferred;
int files_transferred;
bool should_echo;
+ std::chrono::system_clock::time_point last_echo_time;
int nftx;
fd_set master; // master file descriptor list
list agent_info_set;