Skip to content

Commit

Permalink
suicide immediately when resource hard limit is met (#1649)
Browse files Browse the repository at this point in the history
  • Loading branch information
henryzhx8 authored and linrunqi08 committed Jul 31, 2024
1 parent 0c64478 commit 4bf79f5
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 38 deletions.
93 changes: 59 additions & 34 deletions core/monitor/Monitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ void LogtailMonitor::Stop() {

void LogtailMonitor::Monitor() {
LOG_INFO(sLogger, ("profiling", "started"));
int32_t lastMonitorTime = time(NULL);
int32_t lastMonitorTime = time(NULL), lastCheckHardLimitTime = time(nullptr);
CpuStat curCpuStat;
{
unique_lock<mutex> lock(mThreadRunningMux);
Expand Down Expand Up @@ -164,40 +164,54 @@ void LogtailMonitor::Monitor() {
}
#endif

// Update statistics and send to logtail_status_profile regularly.
// If CPU or memory limit triggered, send to logtail_suicide_profile.
if ((monitorTime - lastMonitorTime) < INT32_FLAG(monitor_interval))
continue;
lastMonitorTime = monitorTime;

// Memory usage has exceeded limit, try to free some timeout objects.
if (1 == mMemStat.mViolateNum) {
LOG_DEBUG(sLogger, ("Memory is upper limit", "run gabbage collection."));
LogInput::GetInstance()->SetForceClearFlag(true);
}
GetMemStat();
CalCpuStat(curCpuStat, mCpuStat);
// CalCpuLimit and CalMemLimit will check if the number of violation (CPU
// or memory exceeds limit) // is greater or equal than limits (
// flag(cpu_limit_num) and flag(mem_limit_num)).
// Returning true means too much violations, so we have to prepare to restart
// logtail to release resource.
// Mainly for controlling memory because we have no idea to descrease memory usage.
if (CheckCpuLimit() || CheckMemLimit()) {
LOG_ERROR(sLogger,
("Resource used by program exceeds upper limit",
"prepare restart Logtail")("cpu_usage", mCpuStat.mCpuUsage)("mem_rss", mMemStat.mRss));
Suicide();
static int32_t checkHardLimitInterval
= INT32_FLAG(monitor_interval) > 30 ? INT32_FLAG(monitor_interval) / 6 : 5;
if ((monitorTime - lastCheckHardLimitTime) >= checkHardLimitInterval) {
lastCheckHardLimitTime = monitorTime;

GetMemStat();
CalCpuStat(curCpuStat, mCpuStat);
if (CheckHardCpuLimit() || CheckHardMemLimit()) {
LOG_ERROR(sLogger,
("Resource used by program exceeds hard limit",
"prepare restart Logtail")("cpu_usage", mCpuStat.mCpuUsage)("mem_rss", mMemStat.mRss));
Suicide();
}
}

if (IsHostIpChanged()) {
Suicide();
}

SendStatusProfile(false);
if (BOOL_FLAG(logtail_dump_monitor_info)) {
if (!DumpMonitorInfo(monitorTime))
LOG_ERROR(sLogger, ("Fail to dump monitor info", ""));
// Update statistics and send to logtail_status_profile regularly.
// If CPU or memory limit triggered, send to logtail_suicide_profile.
if ((monitorTime - lastMonitorTime) >= INT32_FLAG(monitor_interval)) {
lastMonitorTime = monitorTime;

// Memory usage has exceeded limit, try to free some timeout objects.
if (1 == mMemStat.mViolateNum) {
LOG_DEBUG(sLogger, ("Memory is upper limit", "run gabbage collection."));
LogInput::GetInstance()->SetForceClearFlag(true);
}
// CalCpuLimit and CalMemLimit will check if the number of violation (CPU
// or memory exceeds limit) // is greater or equal than limits (
// flag(cpu_limit_num) and flag(mem_limit_num)).
// Returning true means too much violations, so we have to prepare to restart
// logtail to release resource.
// Mainly for controlling memory because we have no idea to descrease memory usage.
if (CheckSoftCpuLimit() || CheckSoftMemLimit()) {
LOG_ERROR(sLogger,
("Resource used by program exceeds upper limit for some time",
"prepare restart Logtail")("cpu_usage", mCpuStat.mCpuUsage)("mem_rss", mMemStat.mRss));
Suicide();
}

if (IsHostIpChanged()) {
Suicide();
}

SendStatusProfile(false);
if (BOOL_FLAG(logtail_dump_monitor_info)) {
if (!DumpMonitorInfo(monitorTime))
LOG_ERROR(sLogger, ("Fail to dump monitor info", ""));
}
}
}
}
Expand Down Expand Up @@ -424,7 +438,7 @@ void LogtailMonitor::CalCpuStat(const CpuStat& curCpu, CpuStat& savedCpu) {
#endif
}

bool LogtailMonitor::CheckCpuLimit() {
bool LogtailMonitor::CheckSoftCpuLimit() {
float cpuUsageLimit = AppConfig::GetInstance()->IsResourceAutoScale()
? AppConfig::GetInstance()->GetScaledCpuUsageUpLimit()
: AppConfig::GetInstance()->GetCpuUsageUpLimit();
Expand All @@ -436,7 +450,7 @@ bool LogtailMonitor::CheckCpuLimit() {
return false;
}

bool LogtailMonitor::CheckMemLimit() {
bool LogtailMonitor::CheckSoftMemLimit() {
if (mMemStat.mRss > AppConfig::GetInstance()->GetMemUsageUpLimit()) {
if (++mMemStat.mViolateNum > INT32_FLAG(mem_limit_num))
return true;
Expand All @@ -445,6 +459,17 @@ bool LogtailMonitor::CheckMemLimit() {
return false;
}

bool LogtailMonitor::CheckHardCpuLimit() {
float cpuUsageLimit = AppConfig::GetInstance()->IsResourceAutoScale()
? AppConfig::GetInstance()->GetScaledCpuUsageUpLimit()
: AppConfig::GetInstance()->GetCpuUsageUpLimit();
return mCpuStat.mCpuUsage > 10 * cpuUsageLimit;
}

bool LogtailMonitor::CheckHardMemLimit() {
return mMemStat.mRss > 10 * AppConfig::GetInstance()->GetMemUsageUpLimit();
}

void LogtailMonitor::DumpToLocal(const sls_logs::LogGroup& logGroup) {
string dumpStr = "\n####logtail status####\n";
for (int32_t logIdx = 0; logIdx < logGroup.logs_size(); ++logIdx) {
Expand Down
11 changes: 7 additions & 4 deletions core/monitor/Monitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,15 @@ class LogtailMonitor : public MetricStore {
// set @curCpu to @savedCpu after calculation.
void CalCpuStat(const CpuStat& curCpu, CpuStat& savedCpu);

// CheckCpuLimit checks if current cpu usage exceeds limit.
// CheckSoftCpuLimit checks if current cpu usage exceeds limit.
// @return true if the cpu usage exceeds limit continuously.
bool CheckCpuLimit();
// CheckMemLimit checks if the memory usage exceeds limit.
bool CheckSoftCpuLimit();
// CheckSoftMemLimit checks if the memory usage exceeds limit.
// @return true if the memory usage exceeds limit continuously.
bool CheckMemLimit();
bool CheckSoftMemLimit();

bool CheckHardCpuLimit();
bool CheckHardMemLimit();

// SendStatusProfile collects status profile and send them to server.
// @suicide indicates if the target LogStore is logtail_suicide_profile.
Expand Down

0 comments on commit 4bf79f5

Please sign in to comment.