Skip to content

Commit

Permalink
alarm calculate ignore metrics collect code - TIMEOUT (#1478)
Browse files Browse the repository at this point in the history
Signed-off-by: tomsun28 <tomsun28@outlook.com>
  • Loading branch information
tomsun28 authored Jan 11, 2024
1 parent e9913d7 commit f9bbd78
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 60 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,13 @@ public class CalculateAlarm {
/**
 * Alerts whose threshold rule has matched but which are still being counted
 * towards the configured trigger times (entries are added when the required
 * times is greater than one, and removed once the alert is sent or the rule
 * stops matching).
 * key - monitorId + alertDefineId + tags : common threshold alarm
 *       (tags is the empty string for the row-count-only evaluation)
 * key - monitorId : monitoring status availability / reachability alarm
 */
private final Map<String, Alert> triggeredAlertMap;
/**
 * Alerts that have been sent and are not yet recovered.
 * An entry is removed when the recovery handling for its key runs.
 * key - monitorId + alertDefineId + tags (the same key used for the triggered alert)
 */
private final Map<String, Alert> notRecoveredAlertMap;
private final AlerterWorkerPool workerPool;
Expand Down Expand Up @@ -154,27 +154,28 @@ private void calculate(CollectRep.MetricsData metricsData) {
if (StringUtils.isBlank(expr)) {
continue;
}
if (expr.contains(SYSTEM_VALUE_ROW_COUNT)) {
if (expr.contains(SYSTEM_VALUE_ROW_COUNT) && metricsData.getValuesCount() == 0) {
fieldValueMap.put(SYSTEM_VALUE_ROW_COUNT, valueRowCount);
try {
boolean match = execAlertExpression(fieldValueMap, expr);
if (match) {
// If the threshold rule matches, the number of times the threshold has been triggered is determined and an alarm is triggered
// 阈值规则匹配,判断已触发阈值次数,触发告警
afterThresholdRuleMatch(currentTimeMilli, monitorId, app, metrics, fieldValueMap, define);
// 若此阈值已被触发,则其它数据行的触发忽略
continue;
} else {
String monitorAlertKey = String.valueOf(monitorId) + define.getId();
triggeredAlertMap.remove(monitorAlertKey);
if (define.isRecoverNotice()) {
String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + null;
handleRecoveredAlert(currentTimeMilli, define, expr, notResolvedAlertKey);
try {
if (match) {
// If the threshold rule matches, the number of times the threshold has been triggered is determined and an alarm is triggered
// 阈值规则匹配,判断已触发阈值次数,触发告警
afterThresholdRuleMatch(currentTimeMilli, monitorId, app, metrics, "", fieldValueMap, define);
// 若此阈值已被触发,则其它数据行的触发忽略
continue;
} else {
String alarmKey = String.valueOf(monitorId) + define.getId();
triggeredAlertMap.remove(alarmKey);
if (define.isRecoverNotice()) {
handleRecoveredAlert(currentTimeMilli, define, expr, alarmKey);
}
}
} catch (Exception e) {
log.error(e.getMessage(), e);
}
} catch (Exception e) {
log.warn(e.getMessage(), e);
}
} catch (Exception ignored) {}
}
for (CollectRep.ValueRow valueRow : metricsData.getValuesList()) {

Expand All @@ -183,7 +184,7 @@ private void calculate(CollectRep.MetricsData metricsData) {
}
fieldValueMap.clear();
fieldValueMap.put(SYSTEM_VALUE_ROW_COUNT, valueRowCount);
StringBuilder instanceBuilder = new StringBuilder();
StringBuilder tagBuilder = new StringBuilder();
for (int index = 0; index < valueRow.getColumnsList().size(); index++) {
String valueStr = valueRow.getColumns(index);
if (CommonConstants.NULL_VALUE.equals(valueStr)) {
Expand All @@ -210,35 +211,36 @@ private void calculate(CollectRep.MetricsData metricsData) {
}

if (field.getLabel()) {
instanceBuilder.append(valueStr).append("-");
tagBuilder.append("-").append(valueStr);
}
}
try {
boolean match = execAlertExpression(fieldValueMap, expr);
if (match) {
// If the threshold rule matches, the number of times the threshold has been triggered is determined and an alarm is triggered
// 阈值规则匹配,判断已触发阈值次数,触发告警
afterThresholdRuleMatch(currentTimeMilli, monitorId, app, metrics, fieldValueMap, define);
// 若此阈值已被触发,则其它数据行的触发忽略
break;
} else {
String monitorAlertKey = String.valueOf(monitorId) + define.getId();
triggeredAlertMap.remove(monitorAlertKey);
if (define.isRecoverNotice()) {
String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + (instanceBuilder.length() == 0 ? null : instanceBuilder.toString());
handleRecoveredAlert(currentTimeMilli, define, expr, notResolvedAlertKey);
}
try {
if (match) {
// If the threshold rule matches, the number of times the threshold has been triggered is determined and an alarm is triggered
// 阈值规则匹配,判断已触发阈值次数,触发告警
afterThresholdRuleMatch(currentTimeMilli, monitorId, app, metrics, tagBuilder.toString(), fieldValueMap, define);
// 若此阈值已被触发,则其它数据行的触发忽略
break;
} else {
String alarmKey = String.valueOf(monitorId) + define.getId() + tagBuilder;
triggeredAlertMap.remove(alarmKey);
if (define.isRecoverNotice()) {
handleRecoveredAlert(currentTimeMilli, define, expr, alarmKey);
}
}
} catch (Exception e) {
log.error(e.getMessage(), e);
}
} catch (Exception e) {
log.warn(e.getMessage(), e);
}
} catch (Exception ignored) {}
}
}
}
}

private void handleRecoveredAlert(long currentTimeMilli, AlertDefine define, String expr, String notResolvedAlertKey) {
Alert notResolvedAlert = notRecoveredAlertMap.remove(notResolvedAlertKey);
private void handleRecoveredAlert(long currentTimeMilli, AlertDefine define, String expr, String alarmKey) {
Alert notResolvedAlert = notRecoveredAlertMap.remove(alarmKey);
if (notResolvedAlert != null) {
// Sending an alarm Restore
Map<String, String> tags = notResolvedAlert.getTags();
Expand All @@ -257,9 +259,10 @@ private void handleRecoveredAlert(long currentTimeMilli, AlertDefine define, Str
}
}

private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, String app, String metrics, Map<String, Object> fieldValueMap, AlertDefine define) {
String monitorAlertKey = String.valueOf(monitorId) + define.getId();
Alert triggeredAlert = triggeredAlertMap.get(monitorAlertKey);
private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, String app, String metrics, String tagStr,
Map<String, Object> fieldValueMap, AlertDefine define) {
String alarmKey = String.valueOf(monitorId) + define.getId() + tagStr;
Alert triggeredAlert = triggeredAlertMap.get(alarmKey);
if (triggeredAlert != null) {
int times = triggeredAlert.getTriggerTimes() + 1;
triggeredAlert.setTriggerTimes(times);
Expand All @@ -268,15 +271,14 @@ private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, Stri
int defineTimes = define.getTimes() == null ? 1 : define.getTimes();
if (times >= defineTimes) {
triggeredAlert.setStatus(ALERT_STATUS_CODE_PENDING);
String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + fieldValueMap.get("instance");
triggeredAlertMap.remove(monitorAlertKey);
notRecoveredAlertMap.put(notResolvedAlertKey, triggeredAlert);
triggeredAlertMap.remove(alarmKey);
notRecoveredAlertMap.put(alarmKey, triggeredAlert);
alarmCommonReduce.reduceAndSendAlarm(triggeredAlert.clone());
}
} else {
fieldValueMap.put("app", app);
fieldValueMap.put("metrics", metrics);
fieldValueMap.put("metric", define.getField());
fieldValueMap.put(TAG_MONITOR_APP, app);
fieldValueMap.put(TAG_METRICS, metrics);
fieldValueMap.put(TAG_METRIC, define.getField());
Map<String, String> tags = new HashMap<>(8);
tags.put(CommonConstants.TAG_MONITOR_ID, String.valueOf(monitorId));
tags.put(CommonConstants.TAG_MONITOR_APP, app);
Expand All @@ -301,46 +303,56 @@ private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, Stri
int defineTimes = define.getTimes() == null ? 1 : define.getTimes();
if (1 >= defineTimes) {
alert.setStatus(ALERT_STATUS_CODE_PENDING);
String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + fieldValueMap.get("instance");
notRecoveredAlertMap.put(notResolvedAlertKey, alert);
notRecoveredAlertMap.put(alarmKey, alert);
alarmCommonReduce.reduceAndSendAlarm(alert.clone());
} else {
triggeredAlertMap.put(monitorAlertKey, alert);
triggeredAlertMap.put(alarmKey, alert);
}
}
}

/**
 * Evaluates an alert threshold expression against the collected field values.
 *
 * <p>Every variable referenced by the expression must be present as a key in
 * {@code fieldValueMap}; a missing variable is reported as an
 * {@link ExpressionRuntimeException} instead of being evaluated as null.
 * Failures are logged here together with the rule text and then rethrown, so
 * the caller decides how to proceed.
 *
 * @param fieldValueMap variable name to value bindings used for evaluation
 * @param expr          the alert rule expression
 * @return {@code true} only when the expression evaluates to {@code Boolean.TRUE};
 *         a {@code null} result is treated as "no match"
 * @throws CompileExpressionErrorException when the expression cannot be compiled
 * @throws ExpressionSyntaxErrorException  when the expression has a syntax error
 * @throws ExpressionRuntimeException      when a variable is missing or evaluation fails
 */
private boolean execAlertExpression(Map<String, Object> fieldValueMap, String expr) {
    Boolean match;
    try {
        Expression expression = AviatorEvaluator.compile(expr, true);
        // Fail fast when the rule references a field this metrics row does not carry.
        expression.getVariableNames().forEach(variable -> {
            if (!fieldValueMap.containsKey(variable)) {
                throw new ExpressionRuntimeException("metrics value not contains expr field: " + variable);
            }
        });
        match = (Boolean) expression.execute(fieldValueMap);
    } catch (CompileExpressionErrorException | ExpressionSyntaxErrorException compileException) {
        log.error("Alert Define Rule: {} Compile Error: {}.", expr, compileException.getMessage());
        throw compileException;
    } catch (ExpressionRuntimeException expressionRuntimeException) {
        log.error("Alert Define Rule: {} Run Error: {}.", expr, expressionRuntimeException.getMessage());
        throw expressionRuntimeException;
    } catch (Exception e) {
        // Log the rule text (not the exception object) in the first placeholder.
        log.error("Alert Define Rule: {} Unknown Error: {}.", expr, e.getMessage());
        throw e;
    }
    return match != null && match;
}

private void handlerAvailableMetrics(long monitorId, String app, CollectRep.MetricsData metricsData) {
if (metricsData.getCode() == CollectRep.Code.TIMEOUT) {
return;
}
// TODO CACHE getMonitorBindAlertAvaDefine
AlertDefine avaAlertDefine = alertDefineService.getMonitorBindAlertAvaDefine(monitorId, app, CommonConstants.AVAILABILITY);
if (avaAlertDefine == null) {
return;
}
long currentTimeMill = System.currentTimeMillis();
if (metricsData.getCode() != CollectRep.Code.SUCCESS) {
if (metricsData.getCode() != CollectRep.Code.SUCCESS ) {
Alert preAlert = triggeredAlertMap.get(String.valueOf(monitorId));
Map<String, String> tags = new HashMap<>(6);
tags.put(CommonConstants.TAG_MONITOR_ID, String.valueOf(monitorId));
tags.put(CommonConstants.TAG_MONITOR_APP, app);
tags.put(CommonConstants.TAG_THRESHOLD_ID, String.valueOf(avaAlertDefine.getId()));
tags.put("metrics", CommonConstants.AVAILABILITY);
tags.put("code", metricsData.getCode().name());
tags.put(TAG_METRICS, CommonConstants.AVAILABILITY);
tags.put(TAG_CODE, metricsData.getCode().name());
Map<String, Object> valueMap = tags.entrySet().stream()
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ public void collect(CollectRep.MetricsData.Builder builder, long monitorId, Stri
} catch (PortUnreachableException portUnreachableException) {
String errorMsg = CommonUtil.getMessageFromThrowable(portUnreachableException);
log.info(errorMsg);
builder.setCode(CollectRep.Code.UN_AVAILABLE);
builder.setCode(CollectRep.Code.UN_REACHABLE);
builder.setMsg("Peer port unreachable");
} catch (Exception exception) {
String errorMsg = CommonUtil.getMessageFromThrowable(exception);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ public TimerDispatcher() {
ret.setDaemon(true);
return ret;
}, 1, TimeUnit.SECONDS, 512);
this.currentCyclicTaskMap = new ConcurrentHashMap<>(64);
this.currentCyclicTaskMap = new ConcurrentHashMap<>(8);
this.currentTempTaskMap = new ConcurrentHashMap<>(8);
this.eventListeners = new ConcurrentHashMap<>(8);
this.started = new AtomicBoolean(true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,19 @@ public interface CommonConstants {
String TAG_MONITOR_APP = "app";

/**
 * Internal (built-in) tag: alarm type
 */
String TAG_ALARM_TYPE = "type";
/**
 * Internal (built-in) tag: metrics
 */
String TAG_METRICS = "metrics";

/**
 * Internal (built-in) tag: metric
 */
String TAG_METRIC = "metric";

/**
 * Internal (built-in) tag: code
 */
String TAG_CODE = "code";

/**
* notice_period type 类型字段, 每日类型
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ public List<CollectRep.MetricsData> getCurrentMetricsData(@NonNull Long monitorI
public void saveData(CollectRep.MetricsData metricsData) {
String key = String.valueOf(metricsData.getId());
String hashKey = metricsData.getMetrics();
if (metricsData.getCode() != CollectRep.Code.SUCCESS) {
if (metricsData.getCode() != CollectRep.Code.SUCCESS || !isServerAvailable()) {
return;
}
if (metricsData.getValuesList().isEmpty()) {
Expand Down

0 comments on commit f9bbd78

Please sign in to comment.