From 4bf79f55977f96b28de60711cad99e5b868686d7 Mon Sep 17 00:00:00 2001 From: henryzhx8 Date: Wed, 31 Jul 2024 14:14:16 +0800 Subject: [PATCH] suicide immediately when resource hard limit is met (#1649) --- core/monitor/Monitor.cpp | 93 +++++++++++++++++++++++++--------------- core/monitor/Monitor.h | 11 +++-- 2 files changed, 66 insertions(+), 38 deletions(-) diff --git a/core/monitor/Monitor.cpp b/core/monitor/Monitor.cpp index 373cc4926f..9d8e86c092 100644 --- a/core/monitor/Monitor.cpp +++ b/core/monitor/Monitor.cpp @@ -132,7 +132,7 @@ void LogtailMonitor::Stop() { void LogtailMonitor::Monitor() { LOG_INFO(sLogger, ("profiling", "started")); - int32_t lastMonitorTime = time(NULL); + int32_t lastMonitorTime = time(NULL), lastCheckHardLimitTime = time(nullptr); CpuStat curCpuStat; { unique_lock lock(mThreadRunningMux); @@ -164,40 +164,54 @@ void LogtailMonitor::Monitor() { } #endif - // Update statistics and send to logtail_status_profile regularly. - // If CPU or memory limit triggered, send to logtail_suicide_profile. - if ((monitorTime - lastMonitorTime) < INT32_FLAG(monitor_interval)) - continue; - lastMonitorTime = monitorTime; - - // Memory usage has exceeded limit, try to free some timeout objects. - if (1 == mMemStat.mViolateNum) { - LOG_DEBUG(sLogger, ("Memory is upper limit", "run gabbage collection.")); - LogInput::GetInstance()->SetForceClearFlag(true); - } - GetMemStat(); - CalCpuStat(curCpuStat, mCpuStat); - // CalCpuLimit and CalMemLimit will check if the number of violation (CPU - // or memory exceeds limit) // is greater or equal than limits ( - // flag(cpu_limit_num) and flag(mem_limit_num)). - // Returning true means too much violations, so we have to prepare to restart - // logtail to release resource. - // Mainly for controlling memory because we have no idea to descrease memory usage. - if (CheckCpuLimit() || CheckMemLimit()) { - LOG_ERROR(sLogger, - ("Resource used by program exceeds upper limit", - "prepare restart Logtail")("cpu_usage", mCpuStat.mCpuUsage)("mem_rss", mMemStat.mRss)); - Suicide(); + static int32_t checkHardLimitInterval + = INT32_FLAG(monitor_interval) > 30 ? INT32_FLAG(monitor_interval) / 6 : 5; + if ((monitorTime - lastCheckHardLimitTime) >= checkHardLimitInterval) { + lastCheckHardLimitTime = monitorTime; + + GetMemStat(); + CalCpuStat(curCpuStat, mCpuStat); + if (CheckHardCpuLimit() || CheckHardMemLimit()) { + LOG_ERROR(sLogger, + ("Resource used by program exceeds hard limit", + "prepare restart Logtail")("cpu_usage", mCpuStat.mCpuUsage)("mem_rss", mMemStat.mRss)); + Suicide(); + } } - if (IsHostIpChanged()) { - Suicide(); - } - SendStatusProfile(false); - if (BOOL_FLAG(logtail_dump_monitor_info)) { - if (!DumpMonitorInfo(monitorTime)) - LOG_ERROR(sLogger, ("Fail to dump monitor info", "")); + // Update statistics and send to logtail_status_profile regularly. + // If CPU or memory limit triggered, send to logtail_suicide_profile. + if ((monitorTime - lastMonitorTime) >= INT32_FLAG(monitor_interval)) { + lastMonitorTime = monitorTime; + + // Memory usage has exceeded limit, try to free some timeout objects. + if (1 == mMemStat.mViolateNum) { + LOG_DEBUG(sLogger, ("Memory is upper limit", "run gabbage collection.")); + LogInput::GetInstance()->SetForceClearFlag(true); + } + // CalCpuLimit and CalMemLimit will check if the number of violation (CPU + // or memory exceeds limit) // is greater or equal than limits ( + // flag(cpu_limit_num) and flag(mem_limit_num)). + // Returning true means too much violations, so we have to prepare to restart + // logtail to release resource. + // Mainly for controlling memory because we have no idea to descrease memory usage. + if (CheckSoftCpuLimit() || CheckSoftMemLimit()) { + LOG_ERROR(sLogger, + ("Resource used by program exceeds upper limit for some time", + "prepare restart Logtail")("cpu_usage", mCpuStat.mCpuUsage)("mem_rss", mMemStat.mRss)); + Suicide(); + } + + if (IsHostIpChanged()) { + Suicide(); + } + + SendStatusProfile(false); + if (BOOL_FLAG(logtail_dump_monitor_info)) { + if (!DumpMonitorInfo(monitorTime)) + LOG_ERROR(sLogger, ("Fail to dump monitor info", "")); + } } } } @@ -424,7 +438,7 @@ void LogtailMonitor::CalCpuStat(const CpuStat& curCpu, CpuStat& savedCpu) { #endif } -bool LogtailMonitor::CheckCpuLimit() { +bool LogtailMonitor::CheckSoftCpuLimit() { float cpuUsageLimit = AppConfig::GetInstance()->IsResourceAutoScale() ? AppConfig::GetInstance()->GetScaledCpuUsageUpLimit() : AppConfig::GetInstance()->GetCpuUsageUpLimit(); @@ -436,7 +450,7 @@ bool LogtailMonitor::CheckCpuLimit() { return false; } -bool LogtailMonitor::CheckMemLimit() { +bool LogtailMonitor::CheckSoftMemLimit() { if (mMemStat.mRss > AppConfig::GetInstance()->GetMemUsageUpLimit()) { if (++mMemStat.mViolateNum > INT32_FLAG(mem_limit_num)) return true; @@ -445,6 +459,17 @@ bool LogtailMonitor::CheckMemLimit() { return false; } +bool LogtailMonitor::CheckHardCpuLimit() { + float cpuUsageLimit = AppConfig::GetInstance()->IsResourceAutoScale() + ? AppConfig::GetInstance()->GetScaledCpuUsageUpLimit() + : AppConfig::GetInstance()->GetCpuUsageUpLimit(); + return mCpuStat.mCpuUsage > 10 * cpuUsageLimit; +} + +bool LogtailMonitor::CheckHardMemLimit() { + return mMemStat.mRss > 10 * AppConfig::GetInstance()->GetMemUsageUpLimit(); +} + void LogtailMonitor::DumpToLocal(const sls_logs::LogGroup& logGroup) { string dumpStr = "\n####logtail status####\n"; for (int32_t logIdx = 0; logIdx < logGroup.logs_size(); ++logIdx) { diff --git a/core/monitor/Monitor.h b/core/monitor/Monitor.h index 2fcbd54dd6..3c3267e0bb 100644 --- a/core/monitor/Monitor.h +++ b/core/monitor/Monitor.h @@ -105,12 +105,15 @@ class LogtailMonitor : public MetricStore { // set @curCpu to @savedCpu after calculation. void CalCpuStat(const CpuStat& curCpu, CpuStat& savedCpu); - // CheckCpuLimit checks if current cpu usage exceeds limit. + // CheckSoftCpuLimit checks if current cpu usage exceeds limit. // @return true if the cpu usage exceeds limit continuously. - bool CheckCpuLimit(); - // CheckMemLimit checks if the memory usage exceeds limit. + bool CheckSoftCpuLimit(); + // CheckSoftMemLimit checks if the memory usage exceeds limit. // @return true if the memory usage exceeds limit continuously. - bool CheckMemLimit(); + bool CheckSoftMemLimit(); + + bool CheckHardCpuLimit(); + bool CheckHardMemLimit(); // SendStatusProfile collects status profile and send them to server. // @suicide indicates if the target LogStore is logtail_suicide_profile.