-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Platform] New Health check for Replication status in 2dc setup #4843
Summary: Add ability to configure alerts to be triggered based off of prometheus queries. Also add UI for replication lag alert. Configurable to be turned on/off at runtime as well as have the threshold (in ms) configurable by the user. Default is 3m. Test Plan: Use a proxy metric (cpu_utime) to test the end-to-end flow including UI + triggering alert + resolving alert (through flipping the ">" to "<" in the query) + verifying corresponding emails are sent if the alert is triggered + resolved/not sent if the setting is disabled in the UI. ``` ---------- MESSAGE FOLLOWS ---------- Content-Type: multipart/alternative; boundary="===============1491229132144892902==" MIME-Version: 1.0 Subject: Yugabyte Platform Alert - <[admin][admin]> From: To: daniel@yugabyte.com X-Peer: 127.0.0.1 --===============1491229132144892902== Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Replication Lag Alert for daniel-test-backup-alert-oct-20-4 is firing. --===============1491229132144892902==-- ------------ END MESSAGE ------------ ``` {F14427} {F14428} {F14429} Reviewers: arnav, wesley, spotachev, sanketh, andrew Reviewed By: sanketh, andrew Subscribers: andrew, sshevchenko, jenkins-bot Differential Revision: https://phabricator.dev.yugabyte.com/D9726
- Loading branch information
Showing
22 changed files
with
996 additions
and
61 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
127 changes: 127 additions & 0 deletions
127
managed/src/main/java/com/yugabyte/yw/commissioner/QueryAlerts.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
/* | ||
* Copyright 2020 YugaByte, Inc. and Contributors | ||
* | ||
* Licensed under the Polyform Free Trial License 1.0.0 (the "License"); you | ||
* may not use this file except in compliance with the License. You | ||
* may obtain a copy of the License at | ||
* | ||
* https://github.com/YugaByte/yugabyte-db/blob/master/licenses/POLYFORM-FREE-TRIAL-LICENSE-1.0.0.txt | ||
*/ | ||
|
||
package com.yugabyte.yw.commissioner; | ||
|
||
import akka.actor.ActorSystem; | ||
import com.google.common.annotations.VisibleForTesting; | ||
import com.google.inject.Inject; | ||
import com.google.inject.Singleton; | ||
import com.yugabyte.yw.common.AlertManager; | ||
import com.yugabyte.yw.metrics.MetricQueryHelper; | ||
import com.yugabyte.yw.models.*; | ||
|
||
import java.util.*; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.concurrent.atomic.AtomicBoolean; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
import scala.concurrent.ExecutionContext; | ||
import scala.concurrent.duration.Duration; | ||
|
||
@Singleton | ||
public class QueryAlerts { | ||
public static final Logger LOG = LoggerFactory.getLogger(QueryAlerts.class); | ||
|
||
private AtomicBoolean running = new AtomicBoolean(false); | ||
|
||
private final ActorSystem actorSystem; | ||
|
||
private final ExecutionContext executionContext; | ||
|
||
private final MetricQueryHelper queryHelper; | ||
|
||
private final AlertManager alertManager; | ||
|
||
private final int YB_QUERY_ALERTS_INTERVAL = 1; | ||
|
||
@Inject | ||
public QueryAlerts( | ||
ExecutionContext executionContext, | ||
ActorSystem actorSystem, | ||
AlertManager alertManager, | ||
MetricQueryHelper queryHelper | ||
) { | ||
this.actorSystem = actorSystem; | ||
this.executionContext = executionContext; | ||
this.queryHelper = queryHelper; | ||
this.alertManager = alertManager; | ||
this.initialize(); | ||
} | ||
|
||
public void setRunningState(AtomicBoolean state) { | ||
this.running = state; | ||
} | ||
|
||
private void initialize() { | ||
this.actorSystem.scheduler().schedule( | ||
Duration.create(0, TimeUnit.MINUTES), | ||
Duration.create(YB_QUERY_ALERTS_INTERVAL, TimeUnit.MINUTES), | ||
this::scheduleRunner, | ||
this.executionContext | ||
); | ||
} | ||
|
||
public Set<Alert> processAlertDefinitions(UUID customerUUID) { | ||
Set<Alert> alertsStillActive = new HashSet<>(); | ||
AlertDefinition.listActive(customerUUID).forEach(definition -> { | ||
if (!queryHelper.queryDirect(definition.query).isEmpty()) { | ||
Universe universe = Universe.get(definition.universeUUID); | ||
Alert existingAlert = Alert.getActiveCustomerAlert(customerUUID, definition.uuid); | ||
// Create an alert to activate if it doesn't exist already | ||
if (existingAlert == null) { | ||
Alert.create( | ||
customerUUID, | ||
definition.universeUUID, | ||
Alert.TargetType.UniverseType, | ||
"CUSTOMER_ALERT", | ||
"Error", | ||
String.format("%s for %s is firing", definition.name, universe.name), | ||
definition.isActive, | ||
definition.uuid | ||
); | ||
} else { | ||
alertsStillActive.add(existingAlert); | ||
} | ||
} | ||
}); | ||
|
||
return alertsStillActive; | ||
} | ||
|
||
@VisibleForTesting | ||
void scheduleRunner() { | ||
if (running.compareAndSet(false, true)) { | ||
try { | ||
Set<Alert> alertsStillActive = new HashSet<>(); | ||
|
||
// Pick up all alerts still active + create new alerts | ||
Customer.getAll().forEach(c -> alertsStillActive.addAll(processAlertDefinitions(c.uuid))); | ||
|
||
// Pick up all created alerts that are waiting to be activated | ||
Set<Alert> alertsToTransition = new HashSet<>(Alert.listToActivate()); | ||
|
||
// Pick up all alerts that should be resolved internally but are currently active | ||
Customer.getAll().forEach(c -> | ||
Alert.listActiveCustomerAlerts(c.uuid).forEach(alert -> { | ||
if (!alertsStillActive.contains(alert)) alertsToTransition.add(alert); | ||
})); | ||
|
||
// Trigger alert transitions | ||
alertsToTransition.forEach(alertManager::transitionAlert); | ||
} catch (Exception e) { | ||
LOG.error("Error querying for alerts", e); | ||
} | ||
|
||
running.set(false); | ||
} | ||
} | ||
} |
114 changes: 114 additions & 0 deletions
114
managed/src/main/java/com/yugabyte/yw/common/AlertManager.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
/* | ||
* Copyright 2020 YugaByte, Inc. and Contributors | ||
* | ||
* Licensed under the Polyform Free Trial License 1.0.0 (the "License"); you | ||
* may not use this file except in compliance with the License. You | ||
* may obtain a copy of the License at | ||
* | ||
* https://github.com/YugaByte/yugabyte-db/blob/master/licenses/POLYFORM-FREE-TRIAL-LICENSE-1.0.0.txt | ||
*/ | ||
|
||
package com.yugabyte.yw.common; | ||
|
||
import com.fasterxml.jackson.databind.node.ObjectNode; | ||
import com.yugabyte.yw.forms.CustomerRegisterFormData; | ||
import com.yugabyte.yw.models.*; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
import play.libs.Json; | ||
import play.Configuration; | ||
|
||
import javax.inject.Inject; | ||
import javax.inject.Singleton; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
@Singleton | ||
public class AlertManager { | ||
@Inject | ||
HealthManager healthManager; | ||
|
||
@Inject | ||
Configuration appConfig; | ||
|
||
|
||
public static final Logger LOG = LoggerFactory.getLogger(AlertManager.class); | ||
|
||
public void sendEmail(Alert alert, String state) { | ||
if (!alert.sendEmail) { | ||
return; | ||
} | ||
|
||
AlertDefinition definition = AlertDefinition.get(alert.definitionUUID); | ||
Universe universe = Universe.get(definition.universeUUID); | ||
ObjectNode alertData = Json.newObject() | ||
.put("alert_name", definition.name) | ||
.put("state", state) | ||
.put("universe_name", universe.name); | ||
Customer customer = Customer.get(alert.customerUUID); | ||
String customerTag = String.format("[%s][%s]", customer.name, customer.code); | ||
List<String> destinations = new ArrayList<>(); | ||
String ybEmail = appConfig.getString("yb.health.default_email", null); | ||
CustomerConfig config = CustomerConfig.getAlertConfig(customer.uuid); | ||
CustomerRegisterFormData.AlertingData alertingData = | ||
Json.fromJson(config.data, CustomerRegisterFormData.AlertingData.class); | ||
if (alertingData.sendAlertsToYb && ybEmail != null && !ybEmail.isEmpty()) { | ||
destinations.add(ybEmail); | ||
} | ||
|
||
if (alertingData.alertingEmail != null && !alertingData.alertingEmail.isEmpty()) { | ||
destinations.add(alertingData.alertingEmail); | ||
} | ||
|
||
// Skip sending email if there aren't any destinations to send it to | ||
if (destinations.isEmpty()) { | ||
return; | ||
} | ||
|
||
CustomerConfig smtpConfig = CustomerConfig.getSmtpConfig(customer.uuid); | ||
CustomerRegisterFormData.SmtpData smtpData = null; | ||
if (smtpConfig != null) { | ||
smtpData = Json.fromJson(smtpConfig.data, CustomerRegisterFormData.SmtpData.class); | ||
} | ||
|
||
healthManager.runCommand( | ||
customerTag, | ||
String.join(",", destinations), | ||
smtpData, | ||
alertData | ||
); | ||
} | ||
|
||
/** | ||
* A method to run a state transition for a given alert | ||
* | ||
* @param alert the alert to transition states on | ||
* @return the alert in a new state | ||
*/ | ||
public Alert transitionAlert(Alert alert) { | ||
try { | ||
switch (alert.state) { | ||
case CREATED: | ||
LOG.info("Transitioning alert {} to active", alert.uuid); | ||
sendEmail(alert, "firing"); | ||
alert.state = Alert.State.ACTIVE; | ||
break; | ||
case ACTIVE: | ||
LOG.info("Transitioning alert {} to resolved", alert.uuid); | ||
sendEmail(alert, "resolved"); | ||
alert.state = Alert.State.RESOLVED; | ||
break; | ||
case RESOLVED: | ||
LOG.info("Transitioning alert {} to resolved", alert.uuid); | ||
alert.state = Alert.State.RESOLVED; | ||
break; | ||
} | ||
|
||
alert.save(); | ||
} catch (Exception e) { | ||
LOG.error("Error transitioning alert state for alert {}", alert.uuid, e); | ||
} | ||
|
||
return alert; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.