Skip to content

Commit

Permalink
bugfix: alarm trigger-times does not work when alarm and recovery trigger …
Browse files Browse the repository at this point in the history
…cyclically (#1468)

Signed-off-by: tomsun28 <tomsun28@outlook.com>
  • Loading branch information
tomsun28 committed Mar 10, 2024
1 parent 867596d commit 884c766
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public class CalculateAlarm {
* key - monitorId+alertDefineId 为普通阈值告警 | The alarm is a common threshold alarm
* key - monitorId 为任务状态可用性可达性告警 | The alarm is a task-status availability/reachability alarm
*/
private final Map<String, Alert> triggeredAlertMap;
private final Map<String, Alert> triggeredAlertMap;
/**
* The not recover alert
* key - monitorId + alertDefineId + (instance)
Expand All @@ -91,8 +91,8 @@ public CalculateAlarm(AlerterWorkerPool workerPool, CommonDataQueue dataQueue,
this.alertDefineService = alertDefineService;
this.alertService = alertService;
this.bundle = ResourceBundleUtil.getBundle("alerter");
this.triggeredAlertMap = new ConcurrentHashMap<>(128);
this.notRecoveredAlertMap = new ConcurrentHashMap<>(128);
this.triggeredAlertMap = new ConcurrentHashMap<>(16);
this.notRecoveredAlertMap = new ConcurrentHashMap<>(16);
// Initialize stateAlertMap
List<Monitor> monitors = monitorDao.findMonitorsByStatus(CommonConstants.UN_AVAILABLE_CODE);
if (monitors != null) {
Expand All @@ -102,7 +102,7 @@ public CalculateAlarm(AlerterWorkerPool workerPool, CommonDataQueue dataQueue,
tags.put(TAG_MONITOR_NAME, monitor.getName());
tags.put(TAG_MONITOR_APP, monitor.getApp());
this.notRecoveredAlertMap.put(monitor.getId() + CommonConstants.AVAILABILITY,
Alert.builder().tags(tags).target(AVAILABILITY).status(UN_AVAILABLE_CODE).build());
Alert.builder().tags(tags).target(AVAILABILITY).status(ALERT_STATUS_CODE_PENDING).build());
}
}
startCalculate();
Expand Down Expand Up @@ -145,17 +145,15 @@ private void calculate(CollectRep.MetricsData metricsData) {
return;
}
List<CollectRep.Field> fields = metricsData.getFieldsList();
Map<String, Object> fieldValueMap = new HashMap<>(16);
Map<String, Object> fieldValueMap = new HashMap<>(8);
int valueRowCount = metricsData.getValuesCount();
for (Map.Entry<String, List<AlertDefine>> entry : defineMap.entrySet()) {
List<AlertDefine> defines = entry.getValue();
for (AlertDefine define : defines) {
final String expr = define.getExpr();

if (StringUtils.isBlank(expr)) {
continue;
}

if (expr.contains(SYSTEM_VALUE_ROW_COUNT)) {
fieldValueMap.put(SYSTEM_VALUE_ROW_COUNT, valueRowCount);
try {
Expand All @@ -166,9 +164,13 @@ private void calculate(CollectRep.MetricsData metricsData) {
afterThresholdRuleMatch(currentTimeMilli, monitorId, app, metrics, fieldValueMap, define);
// 若此阈值已被触发,则其它数据行的触发忽略
continue;
} else if (define.isRecoverNotice()) {
String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + null;
handleRecoveredAlert(currentTimeMilli, monitorId, app, define, expr, notResolvedAlertKey);
} else {
String monitorAlertKey = String.valueOf(monitorId) + define.getId();
triggeredAlertMap.remove(monitorAlertKey);
if (define.isRecoverNotice()) {
String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + null;
handleRecoveredAlert(currentTimeMilli, define, expr, notResolvedAlertKey);
}
}
} catch (Exception e) {
log.warn(e.getMessage(), e);
Expand Down Expand Up @@ -219,9 +221,13 @@ private void calculate(CollectRep.MetricsData metricsData) {
afterThresholdRuleMatch(currentTimeMilli, monitorId, app, metrics, fieldValueMap, define);
// 若此阈值已被触发,则其它数据行的触发忽略
break;
} else if (define.isRecoverNotice()) {
String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + (instanceBuilder.length() == 0 ? null : instanceBuilder.toString());
handleRecoveredAlert(currentTimeMilli, monitorId, app, define, expr, notResolvedAlertKey);
} else {
String monitorAlertKey = String.valueOf(monitorId) + define.getId();
triggeredAlertMap.remove(monitorAlertKey);
if (define.isRecoverNotice()) {
String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + (instanceBuilder.length() == 0 ? null : instanceBuilder.toString());
handleRecoveredAlert(currentTimeMilli, define, expr, notResolvedAlertKey);
}
}
} catch (Exception e) {
log.warn(e.getMessage(), e);
Expand All @@ -231,7 +237,7 @@ private void calculate(CollectRep.MetricsData metricsData) {
}
}

private void handleRecoveredAlert(long currentTimeMilli, long monitorId, String app, AlertDefine define, String expr, String notResolvedAlertKey) {
private void handleRecoveredAlert(long currentTimeMilli, AlertDefine define, String expr, String notResolvedAlertKey) {
Alert notResolvedAlert = notRecoveredAlertMap.remove(notResolvedAlertKey);
if (notResolvedAlert != null) {
// Sending an alarm Restore
Expand Down Expand Up @@ -261,6 +267,7 @@ private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, Stri
triggeredAlert.setLastAlarmTime(currentTimeMilli);
int defineTimes = define.getTimes() == null ? 1 : define.getTimes();
if (times >= defineTimes) {
triggeredAlert.setStatus(ALERT_STATUS_CODE_PENDING);
String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + fieldValueMap.get("instance");
triggeredAlertMap.remove(monitorAlertKey);
notRecoveredAlertMap.put(notResolvedAlertKey, triggeredAlert);
Expand All @@ -270,7 +277,7 @@ private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, Stri
fieldValueMap.put("app", app);
fieldValueMap.put("metrics", metrics);
fieldValueMap.put("metric", define.getField());
Map<String, String> tags = new HashMap<>(6);
Map<String, String> tags = new HashMap<>(8);
tags.put(CommonConstants.TAG_MONITOR_ID, String.valueOf(monitorId));
tags.put(CommonConstants.TAG_MONITOR_APP, app);
tags.put(CommonConstants.TAG_THRESHOLD_ID, String.valueOf(define.getId()));
Expand All @@ -283,7 +290,7 @@ private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, Stri
Alert alert = Alert.builder()
.tags(tags)
.priority(define.getPriority())
.status(ALERT_STATUS_CODE_PENDING)
.status(ALERT_STATUS_CODE_NOT_REACH)
.target(app + "." + metrics + "." + define.getField())
.triggerTimes(1)
.firstAlarmTime(currentTimeMilli)
Expand All @@ -293,9 +300,10 @@ private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, Stri
.build();
int defineTimes = define.getTimes() == null ? 1 : define.getTimes();
if (1 >= defineTimes) {
alert.setStatus(ALERT_STATUS_CODE_PENDING);
String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + fieldValueMap.get("instance");
notRecoveredAlertMap.put(notResolvedAlertKey, alert);
alarmCommonReduce.reduceAndSendAlarm(alert);
alarmCommonReduce.reduceAndSendAlarm(alert.clone());
} else {
triggeredAlertMap.put(monitorAlertKey, alert);
}
Expand Down Expand Up @@ -346,27 +354,22 @@ private void handlerAvailableMetrics(long monitorId, String app, CollectRep.Metr
Alert.AlertBuilder alertBuilder = Alert.builder()
.tags(tags)
.priority(avaAlertDefine.getPriority())
.status(ALERT_STATUS_CODE_PENDING)
.status(ALERT_STATUS_CODE_NOT_REACH)
.target(CommonConstants.AVAILABILITY)
.content(AlertTemplateUtil.render(avaAlertDefine.getTemplate(), valueMap))
.firstAlarmTime(currentTimeMill)
.lastAlarmTime(currentTimeMill)
.triggerTimes(1);
if (avaAlertDefine.getTimes() == null || avaAlertDefine.getTimes() <= 1) {
String notResolvedAlertKey = monitorId + CommonConstants.AVAILABILITY;
alertBuilder.status(ALERT_STATUS_CODE_PENDING);
notRecoveredAlertMap.put(notResolvedAlertKey, alertBuilder.build());
alarmCommonReduce.reduceAndSendAlarm(alertBuilder.build().clone());
alarmCommonReduce.reduceAndSendAlarm(alertBuilder.build());
} else {
alertBuilder.status(CommonConstants.ALERT_STATUS_CODE_NOT_REACH);
triggeredAlertMap.put(String.valueOf(monitorId), alertBuilder.build());
}
triggeredAlertMap.put(String.valueOf(monitorId), alertBuilder.build());
} else {
int times = preAlert.getTriggerTimes() + 1;
if (preAlert.getStatus() == ALERT_STATUS_CODE_PENDING) {
times = 1;
preAlert.setContent(AlertTemplateUtil.render(avaAlertDefine.getTemplate(), valueMap));
preAlert.setTags(tags);
}
preAlert.setTriggerTimes(times);
preAlert.setFirstAlarmTime(currentTimeMill);
preAlert.setLastAlarmTime(currentTimeMill);
Expand All @@ -376,14 +379,14 @@ private void handlerAvailableMetrics(long monitorId, String app, CollectRep.Metr
String notResolvedAlertKey = monitorId + CommonConstants.AVAILABILITY;
notRecoveredAlertMap.put(notResolvedAlertKey, preAlert.clone());
alarmCommonReduce.reduceAndSendAlarm(preAlert.clone());
} else {
preAlert.setStatus(CommonConstants.ALERT_STATUS_CODE_NOT_REACH);
triggeredAlertMap.remove(String.valueOf(monitorId));
}
}
} else {
// Check whether this monitor previously raised an availability or unreachable alarm;
// if so, send a recovery alarm to restore the monitor's task status
// 判断关联监控之前是否有可用性或者不可达告警,发送恢复告警进行任务状态恢复
triggeredAlertMap.remove(String.valueOf(monitorId));
String notResolvedAlertKey = monitorId + CommonConstants.AVAILABILITY;
Alert notResolvedAlert = notRecoveredAlertMap.remove(notResolvedAlertKey);
if (notResolvedAlert != null) {
Expand Down
2 changes: 1 addition & 1 deletion web-app/src/assets/i18n/zh-CN.json
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@
"alert.center.tags": "标签",
"alert.center.status": "状态",
"alert.center.time": "告警时间",
"alert.center.time.tip": "此告警期间统计触发 {{times}} 次告警",
"alert.center.time.tip": "此告警期间累计触发 {{times}} 次告警",
"alert.center.first-time": "开始",
"alert.center.last-time": "最新",
"alert.center.confirm.delete": "请确认是否删除!",
Expand Down
2 changes: 1 addition & 1 deletion web-app/src/assets/i18n/zh-TW.json
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@
"alert.center.tags": "標簽",
"alert.center.status": "狀態",
"alert.center.time": "告警時間",
"alert.center.time.tip": "此告警期間統計觸發 {{times}} 次告警",
"alert.center.time.tip": "此告警期間累計觸發 {{times}} 次告警",
"alert.center.first-time": "開始",
"alert.center.last-time": "最新",
"alert.center.confirm.delete": "請確認是否刪除!",
Expand Down

0 comments on commit 884c766

Please sign in to comment.