Skip to content

Commit

Permalink
fix: alert
Browse files Browse the repository at this point in the history
  • Loading branch information
Amuhar committed Jan 25, 2024
1 parent 92538d3 commit c46c3d6
Show file tree
Hide file tree
Showing 12 changed files with 135 additions and 80 deletions.
13 changes: 13 additions & 0 deletions alerts/keys-api-outdated-keys.empty-db.rule.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Alerting rules for the initial keys update of Keys API.
groups:
- name: Keys API. Initial keys update check
rules:
# Fires when at least 15 minutes have passed since process_start_time_seconds
# while lido_keys_api_last_update_timestamp is still exactly 0.
# NOTE(review): a value of 0 presumably means no update has completed yet
# (empty database) — confirm against the code that sets this metric.
- alert: KeysApiOutdatedKeysEmptyDB
expr: |
(time() - process_start_time_seconds{}) >= 15 * 60 and (lido_keys_api_last_update_timestamp{} == 0)
labels:
severity: critical
service: keys_api
app_team: tooling
annotations:
summary: Keys are outdated
# $value comes from the left-hand side of `and`: seconds elapsed since process start.
description: 'Initial update took more than {{ $value | humanizeDuration }}'
36 changes: 36 additions & 0 deletions alerts/keys-api-outdated-keys.empty-db.test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
rule_files:
- 'keys-api-outdated-keys.empty-db.rule.yml'

evaluation_interval: 1m

tests:
# Test Case 1: Keys API not updated since process start
- interval: 1m
input_series:
- series: 'process_start_time_seconds{}'
values: '0x15'
- series: 'lido_keys_api_last_update_timestamp{}'
values: '0x15'
alert_rule_test:
- eval_time: 15m
alertname: KeysApiOutdatedKeysEmptyDB
exp_alerts:
- exp_labels:
severity: 'critical'
service: 'keys_api'
app_team: 'tooling'
exp_annotations:
summary: 'Keys are outdated'
description: 'Initial update took more than 15m 0s'

# Test Case 2: Keys API updated after process start
- interval: 1m
input_series:
- series: 'process_start_time_seconds{}'
values: '0x15'
- series: 'lido_keys_api_last_update_timestamp{}'
values: '0x12 0 0 1'
alert_rule_test:
- eval_time: 15m
alertname: KeysApiOutdatedKeysEmptyDB
exp_alerts: []
13 changes: 13 additions & 0 deletions alerts/keys-api-outdated-keys.non-empty-db.rule.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Alerting rules for stale keys when a previous update has already been recorded.
groups:
- name: Keys API. Update on non empty db
rules:
# Fires when lido_keys_api_last_update_timestamp is positive and lies
# at least 10 minutes (600 seconds) behind the current evaluation time.
# NOTE(review): the metric is presumably a Unix timestamp in seconds — confirm.
- alert: KeysApiOutdatedKeysNonEmptyDB
expr: |
(time() - lido_keys_api_last_update_timestamp{}) >= 10*60 and lido_keys_api_last_update_timestamp{} > 0
labels:
severity: critical
service: keys_api
app_team: tooling
annotations:
summary: Keys are outdated
# $value comes from the left-hand side of `and`: seconds elapsed since the last recorded update.
description: 'Keys were not updated for more than {{ $value | humanizeDuration }}'
32 changes: 32 additions & 0 deletions alerts/keys-api-outdated-keys.non-empty-db.test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
rule_files:
- 'keys-api-outdated-keys.non-empty-db.rule.yml'

evaluation_interval: 1m

tests:
# Test Case 1: Keys API not updated for more than 10 minutes
- interval: 1m
input_series:
- series: 'lido_keys_api_last_update_timestamp{}'
values: '0x10 60'
alert_rule_test:
- eval_time: 11m
alertname: KeysApiOutdatedKeysNonEmptyDB
exp_alerts:
- exp_labels:
severity: 'critical'
service: 'keys_api'
app_team: 'tooling'
exp_annotations:
summary: 'Keys are outdated'
description: 'Keys were not updated for more than 10m 0s'

# Test Case 2: Keys API updated within the last 10 minutes
- interval: 1m
input_series:
- series: 'lido_keys_api_last_update_timestamp{}'
values: '60x10 61'
alert_rule_test:
- eval_time: 11m
alertname: KeysApiOutdatedKeysNonEmptyDB
exp_alerts: []
13 changes: 0 additions & 13 deletions alerts/keys-api-outdated-keys.rule.yml

This file was deleted.

49 changes: 0 additions & 49 deletions alerts/keys-api-outdated-keys.test.yml

This file was deleted.

4 changes: 2 additions & 2 deletions alerts/keys-api-outdated-validators.rule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ groups:
- name: Keys API. Validators
rules:
- alert: KeysApiOutdatedValidators
expr: validators_registry_enabled{} == 1 AND (time() - lido_keys_api_validators_registry_last_update_block_timestamp{} >= 3600)
expr: (time() - lido_keys_api_validators_registry_last_update_block_timestamp{} >= 3600) AND validators_registry_enabled{} == 1
labels:
severity: critical
service: keys_api
app_team: tooling
annotations:
summary: Validators are outdated
description: 'Validators were not updated for more than 60 minutes'
description: 'Validators were not updated for more than {{ $value | humanizeDuration }}'
4 changes: 2 additions & 2 deletions alerts/keys-api-outdated-validators.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ tests:
app_team: tooling
exp_annotations:
summary: Validators are outdated
description: Validators were not updated for more than 60 minutes
description: Validators were not updated for more than 1h 0m 0s
- eval_time: 75m
alertname: KeysApiOutdatedValidators
exp_alerts:
Expand All @@ -42,7 +42,7 @@ tests:
app_team: tooling
exp_annotations:
summary: Validators are outdated
description: Validators were not updated for more than 60 minutes
description: Validators were not updated for more than 1h 0m 0s

# Actual validators list
- interval: 15m
Expand Down
11 changes: 11 additions & 0 deletions docker-compose.metrics.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ services:
- 9090:9090
volumes:
- ./prometheus/:/etc/prometheus/
- ./alerts/:/etc/prometheus/alerts/
command: --config.file=/etc/prometheus/prometheus.yml --enable-feature=remote-write-receiver

keys_api_grafana:
Expand All @@ -20,3 +21,13 @@ services:
- ./grafana/datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
depends_on:
- keys_api_prometheus

alertmanager:
image: prom/alertmanager:latest
ports:
- 9093:9093
restart: always
volumes:
- ./prometheus/:/etc/alertmanager/
depends_on:
- keys_api_prometheus
9 changes: 9 additions & 0 deletions prometheus/prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,12 @@ scrape_configs:
- targets: ['docker.for.mac.host.internal:3000']
remote_write:
- url: https://localhost/api/v1/write

rule_files:
- 'alerts/keys-api-outdated-keys.empty-db.rule.yml'
- 'alerts/keys-api-outdated-keys.non-empty-db.rule.yml'

alerting:
alertmanagers:
- static_configs:
- targets: ['docker.for.mac.host.internal:9093']
29 changes: 16 additions & 13 deletions src/jobs/keys-update/keys-update.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,19 +45,22 @@ export class KeysUpdateService {
protected lastTimestampSec: number | undefined = undefined;
protected lastBlockNumber: number | undefined = undefined;

// name of interval for updating keys
// Name of interval for updating keys
public UPDATE_KEYS_JOB_NAME = 'SRModulesKeysUpdate';
// timeout for update keys
// if during 30 minutes nothing happen we will exit
// Timeout for update keys
// If nothing happens for 30 minutes, we will exit
UPDATE_KEYS_TIMEOUT_MS = 30 * 60 * 1000; // 30 minutes
updateDeadlineTimer: undefined | NodeJS.Timeout = undefined;

/**
* Initializes the job
*/
public async initialize(): Promise<void> {
// at first start timer for checking update
// if timer isn't cleared in 30 minutes period, we will consider it as nodejs frizzing and exit
// Set metrics based on the values from the database
this.updateMetrics();

// Initially, start the timer to check whether an update has occurred or not
// If the timer isn't cleared within the 30-minute period, we treat it as Node.js freezing and exit
this.checkKeysUpdateTimeout();
await this.updateKeys().catch((error) => this.logger.error(error));

Expand All @@ -70,9 +73,9 @@ export class KeysUpdateService {

private checkKeysUpdateTimeout() {
const currTimestampSec = new Date().getTime() / 1000;
// currTimestampSec - this.lastTimestampSec - time since last update in seconds
// currTimestampSec - this.lastTimestampSec - Time since last update in seconds
// this.UPDATE_KEYS_TIMEOUT_MS / 1000 - timeout in seconds
// so if time since last update is less than timeout, this means keys are updated
// So if time since last update is less than timeout, this means keys are updated
const isUpdated =
this.lastTimestampSec && currTimestampSec - this.lastTimestampSec < this.UPDATE_KEYS_TIMEOUT_MS / 1000;

Expand Down Expand Up @@ -170,20 +173,20 @@ export class KeysUpdateService {

this.prometheusService.registryNumberOfKeysBySRModuleAndOperator.reset();

for (const module of stakingModules) {
const moduleInstance = this.stakingRouterService.getStakingRouterModuleImpl(module.type);
for (const stakingModule of stakingModules) {
const moduleInstance = this.stakingRouterService.getStakingRouterModuleImpl(stakingModule.type);

// update nonce metric
this.prometheusService.registryNonce.set({ srModuleId: module.moduleId }, module.nonce);
this.prometheusService.registryNonce.set({ srModuleId: stakingModule.moduleId }, stakingModule.nonce);

// get operators
const operators = await moduleInstance.getOperators(module.stakingModuleAddress);
const operators = await moduleInstance.getOperators(stakingModule.stakingModuleAddress);

operators.forEach((operator) => {
this.prometheusService.registryNumberOfKeysBySRModuleAndOperator.set(
{
operator: operator.index,
srModuleId: module.moduleId,
srModuleId: stakingModule.moduleId,
used: 'true',
},
operator.usedSigningKeys,
Expand All @@ -192,7 +195,7 @@ export class KeysUpdateService {
this.prometheusService.registryNumberOfKeysBySRModuleAndOperator.set(
{
operator: operator.index,
srModuleId: module.moduleId,
srModuleId: stakingModule.moduleId,
used: 'false',
},
operator.totalSigningKeys - operator.usedSigningKeys,
Expand Down
2 changes: 1 addition & 1 deletion src/jobs/keys-update/staking-module-updater.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ export class StakingModuleUpdaterService {
continue;
}

this.logger.log('No changes have been detected in the module, updating is not required', {
this.logger.log('No changes have been detected in the module, update is not required', {
stakingModuleAddress,
currentBlockHash,
});
Expand Down

0 comments on commit c46c3d6

Please sign in to comment.