From 7255219ce64d25b6ad08a2f6f7214ff606e480ac Mon Sep 17 00:00:00 2001 From: tomsun28 Date: Tue, 9 Jan 2024 18:29:20 +0800 Subject: [PATCH 1/2] alarm calculate ignore metrics collect code - TIMEOUT Signed-off-by: tomsun28 --- .../dromara/hertzbeat/alert/calculate/CalculateAlarm.java | 5 ++++- .../hertzbeat/collector/collect/udp/UdpCollectImpl.java | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/alerter/src/main/java/org/dromara/hertzbeat/alert/calculate/CalculateAlarm.java b/alerter/src/main/java/org/dromara/hertzbeat/alert/calculate/CalculateAlarm.java index 5ea6b55ed7c..661def1a964 100644 --- a/alerter/src/main/java/org/dromara/hertzbeat/alert/calculate/CalculateAlarm.java +++ b/alerter/src/main/java/org/dromara/hertzbeat/alert/calculate/CalculateAlarm.java @@ -327,13 +327,16 @@ private boolean execAlertExpression(Map fieldValueMap, String ex } private void handlerAvailableMetrics(long monitorId, String app, CollectRep.MetricsData metricsData) { + if (metricsData.getCode() == CollectRep.Code.TIMEOUT) { + return; + } // TODO CACHE getMonitorBindAlertAvaDefine AlertDefine avaAlertDefine = alertDefineService.getMonitorBindAlertAvaDefine(monitorId, app, CommonConstants.AVAILABILITY); if (avaAlertDefine == null) { return; } long currentTimeMill = System.currentTimeMillis(); - if (metricsData.getCode() != CollectRep.Code.SUCCESS) { + if (metricsData.getCode() != CollectRep.Code.SUCCESS ) { Alert preAlert = triggeredAlertMap.get(String.valueOf(monitorId)); Map tags = new HashMap<>(6); tags.put(CommonConstants.TAG_MONITOR_ID, String.valueOf(monitorId)); diff --git a/collector/src/main/java/org/dromara/hertzbeat/collector/collect/udp/UdpCollectImpl.java b/collector/src/main/java/org/dromara/hertzbeat/collector/collect/udp/UdpCollectImpl.java index f40801b6a7f..c92e83a5e91 100644 --- a/collector/src/main/java/org/dromara/hertzbeat/collector/collect/udp/UdpCollectImpl.java +++ b/collector/src/main/java/org/dromara/hertzbeat/collector/collect/udp/UdpCollectImpl.java @@ -82,7 +82,7 @@ public void collect(CollectRep.MetricsData.Builder builder, long monitorId, Stri } catch (PortUnreachableException portUnreachableException) { String errorMsg = CommonUtil.getMessageFromThrowable(portUnreachableException); log.info(errorMsg); - builder.setCode(CollectRep.Code.UN_AVAILABLE); + builder.setCode(CollectRep.Code.UN_REACHABLE); builder.setMsg("Peer port unreachable"); } catch (Exception exception) { String errorMsg = CommonUtil.getMessageFromThrowable(exception); From d0f46238d2386a7b32a0165bce222797449d7083 Mon Sep 17 00:00:00 2001 From: tomsun28 Date: Wed, 10 Jan 2024 17:56:48 +0800 Subject: [PATCH 2/2] refactor alarm calculate Signed-off-by: tomsun28 --- .../alert/calculate/CalculateAlarm.java | 117 ++++++++++-------- .../dispatch/timer/TimerDispatcher.java | 2 +- .../common/constants/CommonConstants.java | 14 ++- .../store/RealTimeRedisDataStorage.java | 2 +- 4 files changed, 77 insertions(+), 58 deletions(-) diff --git a/alerter/src/main/java/org/dromara/hertzbeat/alert/calculate/CalculateAlarm.java b/alerter/src/main/java/org/dromara/hertzbeat/alert/calculate/CalculateAlarm.java index 661def1a964..e7ae171b60e 100644 --- a/alerter/src/main/java/org/dromara/hertzbeat/alert/calculate/CalculateAlarm.java +++ b/alerter/src/main/java/org/dromara/hertzbeat/alert/calculate/CalculateAlarm.java @@ -66,13 +66,13 @@ public class CalculateAlarm { /** * The alarm in the process is triggered * 触发中告警信息 - * key - monitorId+alertDefineId 为普通阈值告警 | The alarm is a common threshold alarm + * key - monitorId+alertDefineId+tags 为普通阈值告警 | The alarm is a common threshold alarm * key - monitorId 为任务状态可用性可达性告警 | Indicates the monitoring status availability reachability alarm */ private final Map triggeredAlertMap; /** * The not recover alert - * key - monitorId + alertDefineId + (instance) + * key - monitorId + alertDefineId + tags */ private final Map notRecoveredAlertMap; private final AlerterWorkerPool workerPool; @@ -154,27 +154,28 @@ private void calculate(CollectRep.MetricsData metricsData) { if (StringUtils.isBlank(expr)) { continue; } - if (expr.contains(SYSTEM_VALUE_ROW_COUNT)) { + if (expr.contains(SYSTEM_VALUE_ROW_COUNT) && metricsData.getValuesCount() == 0) { fieldValueMap.put(SYSTEM_VALUE_ROW_COUNT, valueRowCount); try { boolean match = execAlertExpression(fieldValueMap, expr); - if (match) { - // If the threshold rule matches, the number of times the threshold has been triggered is determined and an alarm is triggered - // 阈值规则匹配,判断已触发阈值次数,触发告警 - afterThresholdRuleMatch(currentTimeMilli, monitorId, app, metrics, fieldValueMap, define); - // 若此阈值已被触发,则其它数据行的触发忽略 - continue; - } else { - String monitorAlertKey = String.valueOf(monitorId) + define.getId(); - triggeredAlertMap.remove(monitorAlertKey); - if (define.isRecoverNotice()) { - String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + null; - handleRecoveredAlert(currentTimeMilli, define, expr, notResolvedAlertKey); + try { + if (match) { + // If the threshold rule matches, the number of times the threshold has been triggered is determined and an alarm is triggered + // 阈值规则匹配,判断已触发阈值次数,触发告警 + afterThresholdRuleMatch(currentTimeMilli, monitorId, app, metrics, "", fieldValueMap, define); + // 若此阈值已被触发,则其它数据行的触发忽略 + continue; + } else { + String alarmKey = String.valueOf(monitorId) + define.getId(); + triggeredAlertMap.remove(alarmKey); + if (define.isRecoverNotice()) { + handleRecoveredAlert(currentTimeMilli, define, expr, alarmKey); + } } + } catch (Exception e) { + log.error(e.getMessage(), e); } - } catch (Exception e) { - log.warn(e.getMessage(), e); - } + } catch (Exception ignored) {} } for (CollectRep.ValueRow valueRow : metricsData.getValuesList()) { @@ -183,7 +184,7 @@ private void calculate(CollectRep.MetricsData metricsData) { } fieldValueMap.clear(); fieldValueMap.put(SYSTEM_VALUE_ROW_COUNT, valueRowCount); - StringBuilder instanceBuilder = new StringBuilder(); + StringBuilder tagBuilder = new StringBuilder(); for (int index = 0; index < valueRow.getColumnsList().size(); index++) { String valueStr = valueRow.getColumns(index); if (CommonConstants.NULL_VALUE.equals(valueStr)) { @@ -210,35 +211,36 @@ private void calculate(CollectRep.MetricsData metricsData) { } if (field.getLabel()) { - instanceBuilder.append(valueStr).append("-"); + tagBuilder.append("-").append(valueStr); } } try { boolean match = execAlertExpression(fieldValueMap, expr); - if (match) { - // If the threshold rule matches, the number of times the threshold has been triggered is determined and an alarm is triggered - // 阈值规则匹配,判断已触发阈值次数,触发告警 - afterThresholdRuleMatch(currentTimeMilli, monitorId, app, metrics, fieldValueMap, define); - // 若此阈值已被触发,则其它数据行的触发忽略 - break; - } else { - String monitorAlertKey = String.valueOf(monitorId) + define.getId(); - triggeredAlertMap.remove(monitorAlertKey); - if (define.isRecoverNotice()) { - String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + (instanceBuilder.length() == 0 ? null : instanceBuilder.toString()); - handleRecoveredAlert(currentTimeMilli, define, expr, notResolvedAlertKey); - } + try { + if (match) { + // If the threshold rule matches, the number of times the threshold has been triggered is determined and an alarm is triggered + // 阈值规则匹配,判断已触发阈值次数,触发告警 + afterThresholdRuleMatch(currentTimeMilli, monitorId, app, metrics, tagBuilder.toString(), fieldValueMap, define); + // 若此阈值已被触发,则其它数据行的触发忽略 + break; + } else { + String alarmKey = String.valueOf(monitorId) + define.getId() + tagBuilder; + triggeredAlertMap.remove(alarmKey); + if (define.isRecoverNotice()) { + handleRecoveredAlert(currentTimeMilli, define, expr, alarmKey); + } + } + } catch (Exception e) { + log.error(e.getMessage(), e); } - } catch (Exception e) { - log.warn(e.getMessage(), e); - } + } catch (Exception ignored) {} } } } } - private void handleRecoveredAlert(long currentTimeMilli, AlertDefine define, String expr, String notResolvedAlertKey) { - Alert notResolvedAlert = notRecoveredAlertMap.remove(notResolvedAlertKey); + private void handleRecoveredAlert(long currentTimeMilli, AlertDefine define, String expr, String alarmKey) { + Alert notResolvedAlert = notRecoveredAlertMap.remove(alarmKey); if (notResolvedAlert != null) { // Sending an alarm Restore Map tags = notResolvedAlert.getTags(); @@ -257,9 +259,10 @@ private void handleRecoveredAlert(long currentTimeMilli, AlertDefine define, Str } } - private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, String app, String metrics, Map fieldValueMap, AlertDefine define) { - String monitorAlertKey = String.valueOf(monitorId) + define.getId(); - Alert triggeredAlert = triggeredAlertMap.get(monitorAlertKey); + private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, String app, String metrics, String tagStr, + Map fieldValueMap, AlertDefine define) { + String alarmKey = String.valueOf(monitorId) + define.getId() + tagStr; + Alert triggeredAlert = triggeredAlertMap.get(alarmKey); if (triggeredAlert != null) { int times = triggeredAlert.getTriggerTimes() + 1; triggeredAlert.setTriggerTimes(times); @@ -268,15 +271,14 @@ private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, Stri int defineTimes = define.getTimes() == null ? 1 : define.getTimes(); if (times >= defineTimes) { triggeredAlert.setStatus(ALERT_STATUS_CODE_PENDING); - String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + fieldValueMap.get("instance"); - triggeredAlertMap.remove(monitorAlertKey); - notRecoveredAlertMap.put(notResolvedAlertKey, triggeredAlert); + triggeredAlertMap.remove(alarmKey); + notRecoveredAlertMap.put(alarmKey, triggeredAlert); alarmCommonReduce.reduceAndSendAlarm(triggeredAlert.clone()); } } else { - fieldValueMap.put("app", app); - fieldValueMap.put("metrics", metrics); - fieldValueMap.put("metric", define.getField()); + fieldValueMap.put(TAG_MONITOR_APP, app); + fieldValueMap.put(TAG_METRICS, metrics); + fieldValueMap.put(TAG_METRIC, define.getField()); Map tags = new HashMap<>(8); tags.put(CommonConstants.TAG_MONITOR_ID, String.valueOf(monitorId)); tags.put(CommonConstants.TAG_MONITOR_APP, app); @@ -301,27 +303,34 @@ private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, Stri int defineTimes = define.getTimes() == null ? 1 : define.getTimes(); if (1 >= defineTimes) { alert.setStatus(ALERT_STATUS_CODE_PENDING); - String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + fieldValueMap.get("instance"); - notRecoveredAlertMap.put(notResolvedAlertKey, alert); + notRecoveredAlertMap.put(alarmKey, alert); alarmCommonReduce.reduceAndSendAlarm(alert.clone()); } else { - triggeredAlertMap.put(monitorAlertKey, alert); + triggeredAlertMap.put(alarmKey, alert); } } } private boolean execAlertExpression(Map fieldValueMap, String expr) { - Boolean match = false; + Boolean match; try { Expression expression = AviatorEvaluator.compile(expr, true); + expression.getVariableNames().forEach(variable -> { + if (!fieldValueMap.containsKey(variable)) { + throw new ExpressionRuntimeException("metrics value not contains expr field: " + variable); + } + }); match = (Boolean) expression.execute(fieldValueMap); } catch (CompileExpressionErrorException | ExpressionSyntaxErrorException compileException) { log.error("Alert Define Rule: {} Compile Error: {}.", expr, compileException.getMessage()); + throw compileException; } catch (ExpressionRuntimeException expressionRuntimeException) { log.error("Alert Define Rule: {} Run Error: {}.", expr, expressionRuntimeException.getMessage()); + throw expressionRuntimeException; } catch (Exception e) { - log.error("Alert Define Rule: {} Run Error: {}.", e, e.getMessage()); + log.error("Alert Define Rule: {} Unknown Error: {}.", expr, e.getMessage()); + throw e; } return match != null && match; } @@ -342,8 +351,8 @@ private void handlerAvailableMetrics(long monitorId, String app, CollectRep.Metr tags.put(CommonConstants.TAG_MONITOR_ID, String.valueOf(monitorId)); tags.put(CommonConstants.TAG_MONITOR_APP, app); tags.put(CommonConstants.TAG_THRESHOLD_ID, String.valueOf(avaAlertDefine.getId())); - tags.put("metrics", CommonConstants.AVAILABILITY); - tags.put("code", metricsData.getCode().name()); + tags.put(TAG_METRICS, CommonConstants.AVAILABILITY); + tags.put(TAG_CODE, metricsData.getCode().name()); Map valueMap = tags.entrySet().stream() .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); diff --git a/collector/src/main/java/org/dromara/hertzbeat/collector/dispatch/timer/TimerDispatcher.java b/collector/src/main/java/org/dromara/hertzbeat/collector/dispatch/timer/TimerDispatcher.java index 6a34b45b8f9..750a1e0e806 100644 --- a/collector/src/main/java/org/dromara/hertzbeat/collector/dispatch/timer/TimerDispatcher.java +++ b/collector/src/main/java/org/dromara/hertzbeat/collector/dispatch/timer/TimerDispatcher.java @@ -67,7 +67,7 @@ public TimerDispatcher() { ret.setDaemon(true); return ret; }, 1, TimeUnit.SECONDS, 512); - this.currentCyclicTaskMap = new ConcurrentHashMap<>(64); + this.currentCyclicTaskMap = new ConcurrentHashMap<>(8); this.currentTempTaskMap = new ConcurrentHashMap<>(8); this.eventListeners = new ConcurrentHashMap<>(8); this.started = new AtomicBoolean(true); diff --git a/common/src/main/java/org/dromara/hertzbeat/common/constants/CommonConstants.java b/common/src/main/java/org/dromara/hertzbeat/common/constants/CommonConstants.java index 00e6420fadd..02f43b4d39c 100644 --- a/common/src/main/java/org/dromara/hertzbeat/common/constants/CommonConstants.java +++ b/common/src/main/java/org/dromara/hertzbeat/common/constants/CommonConstants.java @@ -250,9 +250,19 @@ public interface CommonConstants { String TAG_MONITOR_APP = "app"; /** - * 内有标签: alarm type + * 内有标签: metrics */ - String TAG_ALARM_TYPE = "type"; + String TAG_METRICS = "metrics"; + + /** + * 内有标签: metric + */ + String TAG_METRIC = "metric"; + + /** + * 内有标签: code + */ + String TAG_CODE = "code"; /** * notice_period type 类型字段, 每日类型 diff --git a/warehouse/src/main/java/org/dromara/hertzbeat/warehouse/store/RealTimeRedisDataStorage.java b/warehouse/src/main/java/org/dromara/hertzbeat/warehouse/store/RealTimeRedisDataStorage.java index 611732c5ac4..3b8190177ea 100644 --- a/warehouse/src/main/java/org/dromara/hertzbeat/warehouse/store/RealTimeRedisDataStorage.java +++ b/warehouse/src/main/java/org/dromara/hertzbeat/warehouse/store/RealTimeRedisDataStorage.java @@ -78,7 +78,7 @@ public List getCurrentMetricsData(@NonNull Long monitorI public void saveData(CollectRep.MetricsData metricsData) { String key = String.valueOf(metricsData.getId()); String hashKey = metricsData.getMetrics(); - if (metricsData.getCode() != CollectRep.Code.SUCCESS) { + if (metricsData.getCode() != CollectRep.Code.SUCCESS || !isServerAvailable()) { return; } if (metricsData.getValuesList().isEmpty()) {