-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add support for Prometheus AlertManager (#89)
* feat: Add support for Prometheus AlertManager in `alertmanager` * chore: Add default value for CurioAlertingConfig * chore: make the payload label key lowercase && add source to the labels * fix: Add default time for AlertPayload * update doc
- Loading branch information
Showing
8 changed files
with
426 additions
and
138 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
/* | ||
Package alertmanager provides a framework for monitoring and alerting within the Curio project. It supports dynamic plugin integration for alert notifications, allowing for flexible and extensible alerting mechanisms. | ||
Implementing a New Plugin: | ||
1. Define a struct that implements the Plugin interface, which includes the SendAlert method for dispatching alerts. | ||
2. Implement the SendAlert method to handle the alert logic specific to your plugin. | ||
3. Provide a constructor function for your plugin to facilitate its configuration and initialization. | ||
4. Register your plugin in the LoadAlertPlugins function, which dynamically loads plugins based on the CurioAlertingConfig. | ||
Plugin Configuration: | ||
Plugins are configured through the config.CurioAlertingConfig struct. Each plugin can have its own configuration section within this struct, enabling or disabling the plugin and setting plugin-specific parameters. | ||
Example: | ||
```go | ||
type MyPlugin struct{} | ||
func (p *MyPlugin) SendAlert(data *plugin.AlertPayload) error { | ||
// Plugin-specific alert sending logic | ||
return nil | ||
} | ||
func NewMyPlugin() *MyPlugin { | ||
return &MyPlugin{} | ||
} | ||
func LoadAlertPlugins(cfg config.CurioAlertingConfig) []plugin.Plugin { | ||
var plugins []plugin.Plugin | ||
if cfg.MyPlugin.Enabled { | ||
plugins = append(plugins, NewMyPlugin()) | ||
} | ||
return plugins | ||
} | ||
``` | ||
This package leverages the CurioAlertingConfig for plugin configuration, | ||
enabling a modular approach to adding or removing alerting capabilities as required. | ||
*/ | ||
package alertmanager |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
package plugin | ||
|
||
import ( | ||
"bytes" | ||
"encoding/json" | ||
"fmt" | ||
"io" | ||
"net/http" | ||
"time" | ||
|
||
"golang.org/x/xerrors" | ||
|
||
"github.com/filecoin-project/curio/deps/config" | ||
"github.com/samber/lo" | ||
) | ||
|
||
type PagerDuty struct { | ||
cfg config.PagerDutyConfig | ||
} | ||
|
||
func NewPagerDuty(cfg config.PagerDutyConfig) Plugin { | ||
return &PagerDuty{ | ||
cfg: cfg, | ||
} | ||
} | ||
|
||
// SendAlert sends an alert to PagerDuty with the provided payload data. | ||
// It creates a PDData struct with the provided routing key, event action and payload. | ||
// It creates an HTTP POST request with the PagerDuty event URL as the endpoint and the marshaled JSON data as the request body. | ||
// It sends the request using an HTTP client with a maximum of 5 retries for network errors with exponential backoff before each retry. | ||
// It handles different HTTP response status codes and returns an error based on the status code(). | ||
// If all retries fail, it returns an error indicating the last network error encountered. | ||
func (p *PagerDuty) SendAlert(data *AlertPayload) error { | ||
|
||
type pdPayload struct { | ||
Summary string `json:"summary"` | ||
Severity string `json:"severity"` | ||
Source string `json:"source"` | ||
Component string `json:"component,omitempty"` | ||
Group string `json:"group,omitempty"` | ||
Class string `json:"class,omitempty"` | ||
CustomDetails interface{} `json:"custom_details,omitempty"` | ||
} | ||
|
||
type pdData struct { | ||
RoutingKey string `json:"routing_key"` | ||
EventAction string `json:"event_action"` | ||
Payload *pdPayload `json:"payload"` | ||
} | ||
|
||
payload := &pdData{ | ||
RoutingKey: p.cfg.PageDutyIntegrationKey, | ||
EventAction: "trigger", | ||
Payload: &pdPayload{ | ||
Summary: data.Summary, | ||
Severity: data.Severity, | ||
Source: data.Source, | ||
CustomDetails: data.Details, | ||
}, | ||
} | ||
|
||
jsonData, err := json.Marshal(payload) | ||
if err != nil { | ||
return fmt.Errorf("error marshaling JSON: %w", err) | ||
} | ||
|
||
req, err := http.NewRequest("POST", p.cfg.PagerDutyEventURL, bytes.NewBuffer(jsonData)) | ||
if err != nil { | ||
return fmt.Errorf("error creating request: %w", err) | ||
} | ||
req.Header.Set("Content-Type", "application/json") | ||
|
||
client := &http.Client{ | ||
Timeout: time.Second * 15, | ||
} | ||
iter, _, err := lo.AttemptWithDelay(5, time.Second, | ||
func(index int, duration time.Duration) error { | ||
resp, err := client.Do(req) | ||
if err != nil { | ||
time.Sleep(time.Duration(2*index) * duration) // Exponential backoff | ||
return err | ||
} | ||
defer func() { _ = resp.Body.Close() }() | ||
|
||
switch resp.StatusCode { | ||
case 202: | ||
log.Debug("Accepted: The event has been accepted by PagerDuty.") | ||
return nil | ||
case 400: | ||
bd, rerr := io.ReadAll(resp.Body) | ||
if rerr != nil { | ||
return xerrors.Errorf("Bad request: payload JSON is invalid. Failed to read the body: %w", err) | ||
} | ||
return xerrors.Errorf("Bad request: payload JSON is invalid %s", string(bd)) | ||
case 429: | ||
log.Debug("Too many API calls, retrying after backoff...") | ||
time.Sleep(time.Duration(5*index) * time.Second) // Exponential backoff | ||
case 500, 501, 502, 503, 504: | ||
log.Debug("Server error, retrying after backoff...") | ||
time.Sleep(time.Duration(5*index) * time.Second) // Exponential backoff | ||
default: | ||
log.Errorw("Response status:", resp.Status) | ||
return xerrors.Errorf("Unexpected HTTP response: %s", resp.Status) | ||
} | ||
return nil | ||
}) | ||
if err != nil { | ||
return fmt.Errorf("after %d retries,last error: %w", iter, err) | ||
} | ||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
package plugin | ||
|
||
import ( | ||
"time" | ||
|
||
"github.com/filecoin-project/curio/deps/config" | ||
|
||
logging "github.com/ipfs/go-log/v2" | ||
) | ||
|
||
var log = logging.Logger("curio/alertplugins") | ||
|
||
type Plugin interface { | ||
SendAlert(data *AlertPayload) error | ||
} | ||
|
||
type AlertPayload struct { | ||
Summary string | ||
Severity string | ||
Source string | ||
Details map[string]interface{} | ||
Time time.Time | ||
} | ||
|
||
func LoadAlertPlugins(cfg config.CurioAlertingConfig) []Plugin { | ||
var plugins []Plugin | ||
if cfg.PagerDuty.Enable { | ||
plugins = append(plugins, NewPagerDuty(cfg.PagerDuty)) | ||
} | ||
if cfg.PrometheusAlertManager.Enable { | ||
plugins = append(plugins, NewPrometheusAlertManager(cfg.PrometheusAlertManager)) | ||
} | ||
return plugins | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
package plugin | ||
|
||
import ( | ||
"bytes" | ||
"encoding/json" | ||
"fmt" | ||
"io" | ||
"net/http" | ||
"time" | ||
|
||
"github.com/filecoin-project/curio/deps/config" | ||
"github.com/samber/lo" | ||
) | ||
|
||
type PrometheusAlertManager struct { | ||
cfg config.PrometheusAlertManagerConfig | ||
} | ||
|
||
func NewPrometheusAlertManager(cfg config.PrometheusAlertManagerConfig) Plugin { | ||
return &PrometheusAlertManager{ | ||
cfg: cfg, | ||
} | ||
} | ||
|
||
// SendAlert sends an alert to Prometheus AlertManager with the provided payload data. | ||
// API reference: https://raw.githubusercontent.com/prometheus/alertmanager/main/api/v2/openapi.yaml | ||
func (p *PrometheusAlertManager) SendAlert(data *AlertPayload) error { | ||
if len(data.Details) == 0 { | ||
return nil | ||
} | ||
|
||
type amPayload struct { | ||
StartsAt time.Time `json:"startsAt"` | ||
EndsAt *time.Time `json:"EndsAt,omitempty"` | ||
Annotations map[string]interface{} `json:"annotations"` | ||
Labels map[string]string `json:"labels"` | ||
|
||
// is a unique back-link which identifies the causing entity of this alert | ||
//GeneratorURL string `json:"generatorURL"` | ||
} | ||
|
||
var alerts []*amPayload | ||
for k, v := range data.Details { | ||
alerts = append(alerts, &amPayload{ | ||
StartsAt: data.Time, | ||
Labels: map[string]string{ | ||
"alertName": k, | ||
"severity": data.Severity, | ||
"instance": data.Source, | ||
}, | ||
Annotations: map[string]interface{}{ | ||
"summary": data.Summary, | ||
"details": v, | ||
}, | ||
}) | ||
} | ||
|
||
jsonData, err := json.Marshal(alerts) | ||
if err != nil { | ||
return fmt.Errorf("error marshaling JSON: %w", err) | ||
} | ||
req, err := http.NewRequest("POST", p.cfg.AlertManagerURL, bytes.NewBuffer(jsonData)) | ||
if err != nil { | ||
return fmt.Errorf("error creating request: %w", err) | ||
} | ||
req.Header.Set("Content-Type", "application/json") | ||
|
||
client := &http.Client{ | ||
Timeout: time.Second * 15, | ||
} | ||
iter, _, err := lo.AttemptWithDelay(5, time.Second, | ||
func(index int, duration time.Duration) error { | ||
resp, err := client.Do(req) | ||
if err != nil { | ||
time.Sleep(time.Duration(2*index) * duration) // Exponential backoff | ||
return err | ||
} | ||
defer func() { _ = resp.Body.Close() }() | ||
|
||
switch resp.StatusCode { | ||
case http.StatusOK: | ||
return nil | ||
case http.StatusBadRequest, http.StatusInternalServerError: | ||
errBody, err := io.ReadAll(resp.Body) | ||
if err != nil { | ||
time.Sleep(time.Duration(2*index) * duration) // Exponential backoff | ||
return err | ||
} | ||
return fmt.Errorf("error: %s", string(errBody)) | ||
default: | ||
time.Sleep(time.Duration(2*index) * duration) // Exponential backoff | ||
return fmt.Errorf("unexpected HTTP response: %s", resp.Status) | ||
} | ||
}) | ||
if err != nil { | ||
return fmt.Errorf("after %d retries,last error: %w", iter, err) | ||
} | ||
return nil | ||
} |
Oops, something went wrong.