From 6dbdb784a6f446c624bf676fe870071ffba8e0ce Mon Sep 17 00:00:00 2001 From: TE-fanxl Date: Mon, 22 Aug 2022 13:22:31 +0800 Subject: [PATCH] Add watchdog timeout env variable so the timeout can be configured --- include/nbla/cuda/communicator/watch_dog.hpp | 1 + src/nbla/cuda/communicator/watch_dog.cpp | 23 +++++++++++++++----- src/nbla/cuda/test/test_watch_dog.cpp | 9 ++++++++ 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/include/nbla/cuda/communicator/watch_dog.hpp b/include/nbla/cuda/communicator/watch_dog.hpp index ef882d1a1..d61349746 100644 --- a/include/nbla/cuda/communicator/watch_dog.hpp +++ b/include/nbla/cuda/communicator/watch_dog.hpp @@ -27,6 +27,7 @@ class Watchdog { int state_; int exit_flag_; int timeout_ticks_; + int env_timeout_; std::mutex mutex_; std::condition_variable cv_; int bootup_flag_; diff --git a/src/nbla/cuda/communicator/watch_dog.cpp b/src/nbla/cuda/communicator/watch_dog.cpp index 9ef9a7375..2be75e72c 100644 --- a/src/nbla/cuda/communicator/watch_dog.cpp +++ b/src/nbla/cuda/communicator/watch_dog.cpp @@ -18,8 +18,10 @@ #include #include #include +#include namespace nbla { + void Watchdog::watch_dog_loop() { std::unique_lock lck(mutex_); { @@ -29,19 +31,23 @@ void Watchdog::watch_dog_loop() { } while (!exit_flag_) { if (state_ == START_WATCH_DOG) { + int32_t timeout_set = TICK * timeout_ticks_; + if (env_timeout_ > 0) { + timeout_set = env_timeout_; + } std::cv_status r = - cv_.wait_for(lck, std::chrono::milliseconds(TICK * timeout_ticks_)); + cv_.wait_for(lck, std::chrono::milliseconds(timeout_set)); if (r == std::cv_status::timeout) { const char *e = std::getenv("NNABLA_MPI_WATCH_DOG_ENABLE"); if (!e || *e == '0') { fprintf(stderr, "WARNING: some node stop response for %8.2f seconds!\n", - (TICK * timeout_ticks_) / 1000.0); + timeout_set / 1000.0); break; } else { NBLA_ERROR(error_code::runtime, "System stop response within %8.2f seconds!", - (TICK * timeout_ticks_) / 1000.0); + timeout_set / 1000.0); } } } else { @@ -51,9 +57,16 @@ 
void Watchdog::watch_dog_loop() { } Watchdog::Watchdog(int timeout_ticks) - : state_(0), exit_flag_(0), timeout_ticks_(timeout_ticks), mutex_(), cv_(), - bootup_flag_(0), bootup_(), bcv_(), in_lock_(false), + : state_(0), exit_flag_(0), timeout_ticks_(timeout_ticks), env_timeout_(-1), + mutex_(), cv_(), bootup_flag_(0), bootup_(), bcv_(), in_lock_(false), thread_(&Watchdog::watch_dog_loop, this) { + const char *c_t = std::getenv("NNABLA_MPI_WATCH_DOG_TIMEOUT"); + if (nullptr != c_t) { + int32_t t = std::stoi(c_t); + if (t > 0) + env_timeout_ = t * 1000; // user setting is n seconds. + } + std::unique_lock lck(bootup_); while (!bootup_flag_) bcv_.wait(lck); diff --git a/src/nbla/cuda/test/test_watch_dog.cpp b/src/nbla/cuda/test/test_watch_dog.cpp index c03c27d48..e949170ca 100644 --- a/src/nbla/cuda/test/test_watch_dog.cpp +++ b/src/nbla/cuda/test/test_watch_dog.cpp @@ -82,6 +82,15 @@ TEST(WatchDogTest, TestDisableWithEnv) { setenv("NNABLA_MPI_WATCH_DOG_ENABLE", "1", 1); } +TEST(WatchDogTest, TestEnvTimeout) { + setenv("NNABLA_MPI_WATCH_DOG_TIMEOUT", "3", 0); + Watchdog watch_dog(1); + { + Watchdog::WatchdogLock lck(watch_dog); + std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + } +} + #if 0 // These 2 cases are skipped. // This one is due to too long testing time.