Skip to content

Commit

Permalink
support enable alert threshold auto resolved notice (#1185)
Browse files Browse the repository at this point in the history
Signed-off-by: tomsun28 <tomsun28@outlook.com>
  • Loading branch information
tomsun28 committed Jan 16, 2024
1 parent ffdd16d commit 446102f
Show file tree
Hide file tree
Showing 15 changed files with 214 additions and 149 deletions.

Large diffs are not rendered by default.

8 changes: 3 additions & 5 deletions alerter/src/main/resources/alerter_en_US.properties
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

alerter.availability.emergency = Monitoring Availability Emergency Alert
alerter.reachability.emergency = Monitoring Reachability Emergency Alert
alerter.availability.resolved = Availability Alert Resolved, Monitor Status Normal Now
alerter.reachability.resolved = Reachability Alert Resolved, Monitor Status Normal Now
alerter.availability.recover = Availability Alert Resolved, Monitor Status Normal Now
alerter.alarm.recover = Alert Resolved Notice
alerter.notify.title = HertzBeat Alert Notify
alerter.notify.target = Monitor Target
alerter.notify.monitorId = Monitor ID
Expand All @@ -27,4 +25,4 @@ alerter.notify.content = Alert Content
alerter.notify.console = Console Login
alerter.priority.0 = Emergency Alert
alerter.priority.1 = Critical Alert
alerter.priority.2 = Warning Alert
alerter.priority.2 = Warning Alert
8 changes: 3 additions & 5 deletions alerter/src/main/resources/alerter_zh_CN.properties
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

alerter.availability.emergency = 监控紧急可用性告警
alerter.reachability.emergency = 监控紧急可达性告警
alerter.availability.resolved = 可用性告警恢复通知, 监控状态已恢复正常
alerter.reachability.resolved = 可达性告警恢复通知, 监控状态已恢复正常
alerter.availability.recover = 可用性告警恢复通知, 监控状态已恢复正常
alerter.alarm.recover = 告警恢复通知
alerter.notify.title = HertzBeat告警通知
alerter.notify.target = 告警目标对象
alerter.notify.monitorId = 所属监控ID
Expand All @@ -27,4 +25,4 @@ alerter.notify.content = 内容详情
alerter.notify.console = 登入控制台
alerter.priority.0 = 紧急告警
alerter.priority.1 = 严重告警
alerter.priority.2 = 警告告警
alerter.priority.2 = 警告告警
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ public class AlertDefine {

@Schema(title = "告警阈值开关", example = "true", accessMode = READ_WRITE)
private boolean enable = true;

@Schema(title = "Is send alarm recover notice | 是否发送告警恢复通知", example = "false", accessMode = READ_WRITE)
@Column(columnDefinition = "boolean default false")
private boolean recoverNotice = false;

@Schema(title = "告警通知内容模版", example = "linux {monitor_name}: {monitor_id} cpu usage high",
accessMode = READ_WRITE)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package org.dromara.hertzbeat.common.support.event;

import org.springframework.context.ApplicationEvent;

/**
* Application event carrying the id of a monitor whose lifecycle state changed,
* published through the Spring application context so that other components can
* react without the publisher depending on them directly.
* NOTE(review): despite the name, the visible publishers include a code path that
* re-enables managed monitors, not only the delete paths - confirm whether a more
* general event name is wanted.
* NOTE(review): presumably consumed by the alert calculation component to drop its
* cached triggered alerts for this monitor - the listener is not visible here, verify.
* @author tom
*/
public class MonitorDeletedEvent extends ApplicationEvent {

/**
* Id of the monitor this event refers to; set once in the constructor and never changed.
*/
private final Long monitorId;

/**
* Creates the event.
* @param source    object handed to the {@link ApplicationEvent} constructor as the event source
* @param monitorId id of the monitor this event refers to; stored as-is, no validation is performed
*/
public MonitorDeletedEvent(Object source, Long monitorId) {
super(source);
this.monitorId = monitorId;
}

/**
* Returns the id of the monitor this event refers to.
* @return the monitor id that was passed to the constructor
*/
public Long getMonitorId() {
return monitorId;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
/**
* 报警持久化 - 落地到数据库
* Alarm data persistence - landing in the database
* @author <a href="mailto:Musk.Chen@fanruan.com">Musk.Chen</a>
*
* @author <a href="mailto:Musk.Chen@fanruan.com">Musk.Chen</a>
*/
@Component
@RequiredArgsConstructor
Expand All @@ -43,7 +43,7 @@ final class DbAlertStoreHandlerImpl implements AlertStoreHandler {
private final MonitorService monitorService;

private final AlertService alertService;

@Override
public void store(Alert alert) {
Map<String, String> tags = alert.getTags();
Expand All @@ -60,19 +60,17 @@ public void store(Alert alert) {
// 当监控未管理时 忽略静默其告警信息
return;
}
if (monitor.getStatus() == CommonConstants.AVAILABLE_CODE) {
if (CommonConstants.AVAILABILITY.equals(alert.getTarget())) {
if (CommonConstants.AVAILABILITY.equals(alert.getTarget())) {
if (alert.getStatus() == CommonConstants.ALERT_STATUS_CODE_PENDING && monitor.getStatus() == CommonConstants.AVAILABLE_CODE) {
// Availability Alarm Need to change the monitoring status to unavailable
// 可用性告警 需变更监控状态为不可用
monitorService.updateMonitorStatus(monitor.getId(), CommonConstants.UN_AVAILABLE_CODE);
}
} else {
// If the alarm is restored, the monitoring state needs to be restored
// 若是恢复告警 需对监控状态进行恢复
if (alert.getStatus() == CommonConstants.ALERT_STATUS_CODE_RESTORED) {
} else if (alert.getStatus() == CommonConstants.ALERT_STATUS_CODE_RESTORED && monitor.getStatus() == CommonConstants.UN_AVAILABLE_CODE) {
// If the alarm is restored, the monitoring state needs to be restored
// 若是恢复告警 需对监控状态进行恢复
monitorService.updateMonitorStatus(monitorId, CommonConstants.AVAILABLE_CODE);
}
}
}
} else {
log.debug("store extern alert content: {}.", alert);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public void run(String... args) throws Exception {
.preset(true)
.times(1)
.enable(true)
.recoverNotice(true)
.priority(CommonConstants.ALERT_PRIORITY_CODE_EMERGENCY)
.template("${app} monitoring availability alert, code is ${code}")
.build();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import com.fasterxml.jackson.core.type.TypeReference;
import com.google.gson.Gson;
import lombok.extern.slf4j.Slf4j;
import org.dromara.hertzbeat.alert.calculate.CalculateAlarm;
import org.dromara.hertzbeat.alert.dao.AlertDefineBindDao;
import org.dromara.hertzbeat.common.constants.CommonConstants;
import org.dromara.hertzbeat.common.entity.job.Configmap;
Expand All @@ -33,6 +32,7 @@
import org.dromara.hertzbeat.common.entity.manager.ParamDefine;
import org.dromara.hertzbeat.common.entity.manager.Tag;
import org.dromara.hertzbeat.common.entity.message.CollectRep;
import org.dromara.hertzbeat.common.support.event.MonitorDeletedEvent;
import org.dromara.hertzbeat.common.util.*;
import org.dromara.hertzbeat.manager.dao.CollectorDao;
import org.dromara.hertzbeat.manager.dao.CollectorMonitorBindDao;
Expand All @@ -50,6 +50,7 @@
import org.dromara.hertzbeat.manager.support.exception.MonitorDetectException;
import org.dromara.hertzbeat.manager.support.exception.MonitorMetricsException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.jpa.domain.Specification;
Expand Down Expand Up @@ -112,9 +113,9 @@ public class MonitorServiceImpl implements MonitorService {

@Autowired
private TagMonitorBindDao tagMonitorBindDao;

@Autowired
private CalculateAlarm calculateAlarm;
private ApplicationContext applicationContext;

private final Map<String, ImExportService> imExportServiceMap = new HashMap<>();

Expand Down Expand Up @@ -545,7 +546,6 @@ public void modifyMonitor(Monitor monitor, List<Param> params, String collector)
if (params != null) {
paramDao.saveAll(params);
}
calculateAlarm.triggeredAlertMap.remove(String.valueOf(monitorId));
} catch (Exception e) {
log.error(e.getMessage(), e);
// Repository brushing abnormally cancels the previously delivered task
Expand All @@ -568,7 +568,7 @@ public void deleteMonitor(long id) throws RuntimeException {
tagMonitorBindDao.deleteTagMonitorBindsByMonitorId(id);
alertDefineBindDao.deleteAlertDefineMonitorBindsByMonitorIdEquals(id);
collectJobScheduling.cancelAsyncCollectJob(monitor.getJobId());
calculateAlarm.triggeredAlertMap.remove(String.valueOf(monitor.getId()));
applicationContext.publishEvent(new MonitorDeletedEvent(applicationContext, monitor.getId()));
}
}

Expand All @@ -586,7 +586,7 @@ public void deleteMonitors(Set<Long> ids) throws RuntimeException {
// delete tag 删除监控对应的标签
tagService.deleteMonitorSystemTags(monitor);
collectJobScheduling.cancelAsyncCollectJob(monitor.getJobId());
calculateAlarm.triggeredAlertMap.remove(String.valueOf(monitor.getId()));
applicationContext.publishEvent(new MonitorDeletedEvent(applicationContext, monitor.getId()));
}
}
}
Expand Down Expand Up @@ -674,7 +674,7 @@ public void enableManageMonitors(HashSet<Long> ids) {
// Issue collection tasks 下发采集任务
long newJobId = collectJobScheduling.addAsyncCollectJob(appDefine);
monitor.setJobId(newJobId);
calculateAlarm.triggeredAlertMap.remove(String.valueOf(monitor.getId()));
applicationContext.publishEvent(new MonitorDeletedEvent(applicationContext, monitor.getId()));
}
monitorDao.saveAll(unManagedMonitors);
}
Expand Down Expand Up @@ -773,7 +773,6 @@ public void updateAppCollectJob(Job job) {
// 下发采集任务
long newJobId = collectJobScheduling.addAsyncCollectJob(appDefine);
monitor.setJobId(newJobId);
calculateAlarm.triggeredAlertMap.remove(String.valueOf(monitor.getId()));
monitorDao.save(monitor);
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.dromara.hertzbeat.manager.service;

import org.dromara.hertzbeat.alert.calculate.CalculateAlarm;
import org.dromara.hertzbeat.alert.dao.AlertDefineBindDao;
import org.dromara.hertzbeat.common.entity.alerter.Alert;
import org.dromara.hertzbeat.common.entity.job.Job;
Expand Down Expand Up @@ -32,6 +31,7 @@
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.springframework.context.ApplicationContext;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.jpa.domain.Specification;
Expand Down Expand Up @@ -96,7 +96,7 @@ class MonitorServiceTest {
private CollectorMonitorBindDao collectorMonitorBindDao;

@Mock
private CalculateAlarm calculateAlarm;
private ApplicationContext applicationContext;

@Mock
Map<String, Alert> triggeredAlertMap = spy(new HashMap<>());
Expand All @@ -105,9 +105,7 @@ class MonitorServiceTest {
* 属性无法直接mock,测试执行前-手动赋值
*/
@BeforeEach
public void setUp() {
calculateAlarm.triggeredAlertMap = triggeredAlertMap;
}
public void setUp() {}

@Test
void detectMonitorEmpty() {
Expand Down
31 changes: 16 additions & 15 deletions script/sql/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -125,21 +125,22 @@ CREATE TABLE hzb_tag_monitor_bind
DROP TABLE IF EXISTS hzb_alert_define ;
CREATE TABLE hzb_alert_define
(
id bigint not null auto_increment comment '告警定义ID',
app varchar(100) not null comment '配置告警的监控类型:linux,mysql,jvm...',
metric varchar(100) not null comment '配置告警的指标集合:cpu,memory,info...',
field varchar(100) comment '配置告警的指标:usage,cores...',
preset boolean not null default false comment '是否是全局默认告警,是则所有此类型监控默认关联此告警',
expr varchar(255) comment '告警触发条件表达式',
priority tinyint not null default 0 comment '告警级别 0:高-emergency-紧急告警-红色 1:中-critical-严重告警-橙色 2:低-warning-警告告警-黄色',
times int not null default 1 comment '触发次数,即达到触发阈值次数要求后才算触发告警',
tags varchar(4000) comment '附加告警标签(status:success,env:prod)',
enable boolean not null default true comment '告警阈值开关',
template varchar(255) not null comment '告警通知模板内容',
creator varchar(100) comment '创建者',
modifier varchar(100) comment '最新修改者',
gmt_create timestamp default current_timestamp comment 'create time',
gmt_update datetime default current_timestamp on update current_timestamp comment 'update time',
id bigint not null auto_increment comment '告警定义ID',
app varchar(100) not null comment '配置告警的监控类型:linux,mysql,jvm...',
metric varchar(100) not null comment '配置告警的指标集合:cpu,memory,info...',
field varchar(100) comment '配置告警的指标:usage,cores...',
preset boolean not null default false comment '是否是全局默认告警,是则所有此类型监控默认关联此告警',
expr varchar(255) comment '告警触发条件表达式',
priority tinyint not null default 0 comment '告警级别 0:高-emergency-紧急告警-红色 1:中-critical-严重告警-橙色 2:低-warning-警告告警-黄色',
times int not null default 1 comment '触发次数,即达到触发阈值次数要求后才算触发告警',
tags varchar(4000) comment '附加告警标签(status:success,env:prod)',
enable boolean not null default true comment '告警阈值开关',
template varchar(255) not null comment '告警通知模板内容',
recover_notice boolean not null default false comment 'Is send alarm recovered notice | 是否发送告警恢复通知',
creator varchar(100) comment '创建者',
modifier varchar(100) comment '最新修改者',
gmt_create timestamp default current_timestamp comment 'create time',
gmt_update datetime default current_timestamp on update current_timestamp comment 'update time',
primary key (id)
) ENGINE = InnoDB DEFAULT CHARSET=utf8mb4;

Expand Down
1 change: 1 addition & 0 deletions web-app/src/app/pojo/AlertDefine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export class AlertDefine {
times: number = 3;
tags!: TagItem[];
enable: boolean = true;
recoverNotice: boolean = false;
template!: string;
creator!: string;
modifier!: string;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,19 @@
<nz-switch [(ngModel)]="define.preset" name="preset" id="preset"></nz-switch>
</nz-form-control>
</nz-form-item>
<nz-form-item>
<nz-form-label nzSpan="7" nzFor="recoverNotice" [nzTooltipTitle]="'alert.setting.recover-notice.tip' | i18n">
{{ 'alert.setting.recover-notice' | i18n }}
</nz-form-label>
<nz-form-control nzSpan="12">
<nz-switch
[(ngModel)]="define.recoverNotice"
[ngModelOptions]="{ standalone: true }"
name="recoverNotice"
id="recoverNotice"
></nz-switch>
</nz-form-control>
</nz-form-item>
<nz-form-item>
<nz-form-label nzSpan="7" nzRequired="true" nzFor="enable" [nzTooltipTitle]="'alert.setting.enable.tip' | i18n">
{{ 'alert.setting.enable' | i18n }}
Expand Down
2 changes: 2 additions & 0 deletions web-app/src/assets/i18n/en-US.json
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@
"alert.setting.default.tip": "Whether this alarm threshold configuration applies to all this type of monitoring globally",
"alert.setting.enable": "Enable Alert",
"alert.setting.enable.tip": "This alarm threshold configuration is enabled or disabled",
"alert.setting.recover-notice": "Recover Notice",
"alert.setting.recover-notice.tip": "Whether to send the corresponding recovered notification when the alarm is resolved under this threshold rule",
"alert.setting.connect": "Alert Associate Monitors",
"alert.setting.connect.left": "No Associate",
"alert.setting.connect.right": "Associated",
Expand Down
2 changes: 2 additions & 0 deletions web-app/src/assets/i18n/zh-CN.json
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@
"alert.setting.default.tip": "此告警阈值配置是否应用于全局所有此类型监控",
"alert.setting.enable": "启用告警",
"alert.setting.enable.tip": "此告警阈值配置开启生效或关闭",
"alert.setting.recover-notice": "恢复通知",
"alert.setting.recover-notice.tip": "是否在此阈值规则下告警恢复时发送对应的恢复通知",
"alert.setting.connect": "告警定义关联监控",
"alert.setting.connect.left": "未关联监控",
"alert.setting.connect.right": "已关联监控",
Expand Down
2 changes: 2 additions & 0 deletions web-app/src/assets/i18n/zh-TW.json
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@
"alert.setting.default.tip": "此告警阈值配置是否應用于全局所有此類型監控",
"alert.setting.enable": "啓用告警",
"alert.setting.enable.tip": "此告警阈值配置開啓生效或關閉",
"alert.setting.recover-notice": "恢復通知",
"alert.setting.recover-notice.tip": "是否在此閾值規則下告警恢復時發送對應的恢復通知",
"alert.setting.connect": "告警定義關聯監控",
"alert.setting.connect.left": "未關聯監控",
"alert.setting.connect.right": "已關聯監控",
Expand Down

0 comments on commit 446102f

Please sign in to comment.