Skip to content

Commit

Permalink
feat: Add support for Prometheus AlertManager (#89)
Browse files Browse the repository at this point in the history
* feat: Add support for Prometheus AlertManager in `alertmanager`

* chore: Add default value for CurioAlertingConfig

* chore: make the payload label key lowercase && add source to the labels

* fix: Add default time for AlertPayload

* update doc
  • Loading branch information
strahe authored Jul 15, 2024
1 parent a1b7625 commit 147a77e
Show file tree
Hide file tree
Showing 8 changed files with 426 additions and 138 deletions.
41 changes: 41 additions & 0 deletions alertmanager/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
Package alertmanager provides a framework for monitoring and alerting within the Curio project. It supports dynamic plugin integration for alert notifications, allowing for flexible and extensible alerting mechanisms.
Implementing a New Plugin:
1. Define a struct that implements the Plugin interface, which includes the SendAlert method for dispatching alerts.
2. Implement the SendAlert method to handle the alert logic specific to your plugin.
3. Provide a constructor function for your plugin to facilitate its configuration and initialization.
4. Register your plugin in the LoadAlertPlugins function, which dynamically loads plugins based on the CurioAlertingConfig.
Plugin Configuration:
Plugins are configured through the config.CurioAlertingConfig struct. Each plugin can have its own configuration section within this struct, enabling or disabling the plugin and setting plugin-specific parameters.
Example:
```go
type MyPlugin struct{}
func (p *MyPlugin) SendAlert(data *plugin.AlertPayload) error {
// Plugin-specific alert sending logic
return nil
}
func NewMyPlugin() *MyPlugin {
return &MyPlugin{}
}
func LoadAlertPlugins(cfg config.CurioAlertingConfig) []plugin.Plugin {
var plugins []plugin.Plugin
if cfg.MyPlugin.Enabled {
plugins = append(plugins, NewMyPlugin())
}
return plugins
}
```
This package leverages the CurioAlertingConfig for plugin configuration,
enabling a modular approach to adding or removing alerting capabilities as required.
*/
package alertmanager
111 changes: 111 additions & 0 deletions alertmanager/plugin/pager_duty.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package plugin

import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"time"

"golang.org/x/xerrors"

"github.com/filecoin-project/curio/deps/config"
"github.com/samber/lo"
)

type PagerDuty struct {
cfg config.PagerDutyConfig
}

func NewPagerDuty(cfg config.PagerDutyConfig) Plugin {
return &PagerDuty{
cfg: cfg,
}
}

// SendAlert sends an alert to PagerDuty with the provided payload data.
// It creates a PDData struct with the provided routing key, event action and payload.
// It creates an HTTP POST request with the PagerDuty event URL as the endpoint and the marshaled JSON data as the request body.
// It sends the request using an HTTP client with a maximum of 5 retries for network errors with exponential backoff before each retry.
// It handles different HTTP response status codes and returns an error based on the status code().
// If all retries fail, it returns an error indicating the last network error encountered.
func (p *PagerDuty) SendAlert(data *AlertPayload) error {

type pdPayload struct {
Summary string `json:"summary"`
Severity string `json:"severity"`
Source string `json:"source"`
Component string `json:"component,omitempty"`
Group string `json:"group,omitempty"`
Class string `json:"class,omitempty"`
CustomDetails interface{} `json:"custom_details,omitempty"`
}

type pdData struct {
RoutingKey string `json:"routing_key"`
EventAction string `json:"event_action"`
Payload *pdPayload `json:"payload"`
}

payload := &pdData{
RoutingKey: p.cfg.PageDutyIntegrationKey,
EventAction: "trigger",
Payload: &pdPayload{
Summary: data.Summary,
Severity: data.Severity,
Source: data.Source,
CustomDetails: data.Details,
},
}

jsonData, err := json.Marshal(payload)
if err != nil {
return fmt.Errorf("error marshaling JSON: %w", err)
}

req, err := http.NewRequest("POST", p.cfg.PagerDutyEventURL, bytes.NewBuffer(jsonData))
if err != nil {
return fmt.Errorf("error creating request: %w", err)
}
req.Header.Set("Content-Type", "application/json")

client := &http.Client{
Timeout: time.Second * 15,
}
iter, _, err := lo.AttemptWithDelay(5, time.Second,
func(index int, duration time.Duration) error {
resp, err := client.Do(req)
if err != nil {
time.Sleep(time.Duration(2*index) * duration) // Exponential backoff
return err
}
defer func() { _ = resp.Body.Close() }()

switch resp.StatusCode {
case 202:
log.Debug("Accepted: The event has been accepted by PagerDuty.")
return nil
case 400:
bd, rerr := io.ReadAll(resp.Body)
if rerr != nil {
return xerrors.Errorf("Bad request: payload JSON is invalid. Failed to read the body: %w", err)
}
return xerrors.Errorf("Bad request: payload JSON is invalid %s", string(bd))
case 429:
log.Debug("Too many API calls, retrying after backoff...")
time.Sleep(time.Duration(5*index) * time.Second) // Exponential backoff
case 500, 501, 502, 503, 504:
log.Debug("Server error, retrying after backoff...")
time.Sleep(time.Duration(5*index) * time.Second) // Exponential backoff
default:
log.Errorw("Response status:", resp.Status)
return xerrors.Errorf("Unexpected HTTP response: %s", resp.Status)
}
return nil
})
if err != nil {
return fmt.Errorf("after %d retries,last error: %w", iter, err)
}
return nil
}
34 changes: 34 additions & 0 deletions alertmanager/plugin/plugin.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package plugin

import (
"time"

"github.com/filecoin-project/curio/deps/config"

logging "github.com/ipfs/go-log/v2"
)

var log = logging.Logger("curio/alertplugins")

type Plugin interface {
SendAlert(data *AlertPayload) error
}

type AlertPayload struct {
Summary string
Severity string
Source string
Details map[string]interface{}
Time time.Time
}

func LoadAlertPlugins(cfg config.CurioAlertingConfig) []Plugin {
var plugins []Plugin
if cfg.PagerDuty.Enable {
plugins = append(plugins, NewPagerDuty(cfg.PagerDuty))
}
if cfg.PrometheusAlertManager.Enable {
plugins = append(plugins, NewPrometheusAlertManager(cfg.PrometheusAlertManager))
}
return plugins
}
99 changes: 99 additions & 0 deletions alertmanager/plugin/prometheus_alertmanager.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package plugin

import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"time"

"github.com/filecoin-project/curio/deps/config"
"github.com/samber/lo"
)

type PrometheusAlertManager struct {
cfg config.PrometheusAlertManagerConfig
}

func NewPrometheusAlertManager(cfg config.PrometheusAlertManagerConfig) Plugin {
return &PrometheusAlertManager{
cfg: cfg,
}
}

// SendAlert sends an alert to Prometheus AlertManager with the provided payload data.
// API reference: https://raw.githubusercontent.com/prometheus/alertmanager/main/api/v2/openapi.yaml
func (p *PrometheusAlertManager) SendAlert(data *AlertPayload) error {
if len(data.Details) == 0 {
return nil
}

type amPayload struct {
StartsAt time.Time `json:"startsAt"`
EndsAt *time.Time `json:"EndsAt,omitempty"`
Annotations map[string]interface{} `json:"annotations"`
Labels map[string]string `json:"labels"`

// is a unique back-link which identifies the causing entity of this alert
//GeneratorURL string `json:"generatorURL"`
}

var alerts []*amPayload
for k, v := range data.Details {
alerts = append(alerts, &amPayload{
StartsAt: data.Time,
Labels: map[string]string{
"alertName": k,
"severity": data.Severity,
"instance": data.Source,
},
Annotations: map[string]interface{}{
"summary": data.Summary,
"details": v,
},
})
}

jsonData, err := json.Marshal(alerts)
if err != nil {
return fmt.Errorf("error marshaling JSON: %w", err)
}
req, err := http.NewRequest("POST", p.cfg.AlertManagerURL, bytes.NewBuffer(jsonData))
if err != nil {
return fmt.Errorf("error creating request: %w", err)
}
req.Header.Set("Content-Type", "application/json")

client := &http.Client{
Timeout: time.Second * 15,
}
iter, _, err := lo.AttemptWithDelay(5, time.Second,
func(index int, duration time.Duration) error {
resp, err := client.Do(req)
if err != nil {
time.Sleep(time.Duration(2*index) * duration) // Exponential backoff
return err
}
defer func() { _ = resp.Body.Close() }()

switch resp.StatusCode {
case http.StatusOK:
return nil
case http.StatusBadRequest, http.StatusInternalServerError:
errBody, err := io.ReadAll(resp.Body)
if err != nil {
time.Sleep(time.Duration(2*index) * duration) // Exponential backoff
return err
}
return fmt.Errorf("error: %s", string(errBody))
default:
time.Sleep(time.Duration(2*index) * duration) // Exponential backoff
return fmt.Errorf("unexpected HTTP response: %s", resp.Status)
}
})
if err != nil {
return fmt.Errorf("after %d retries,last error: %w", iter, err)
}
return nil
}
Loading

0 comments on commit 147a77e

Please sign in to comment.