Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add support for Prometheus AlertManager #89

Merged
merged 5 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions alertmanager/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
Package alertmanager provides a framework for monitoring and alerting within the Curio project. It supports dynamic plugin integration for alert notifications, allowing for flexible and extensible alerting mechanisms.

Implementing a New Plugin:

1. Define a struct that implements the Plugin interface, which includes the SendAlert method for dispatching alerts.
2. Implement the SendAlert method to handle the alert logic specific to your plugin.
3. Provide a constructor function for your plugin to facilitate its configuration and initialization.
4. Register your plugin in the LoadAlertPlugins function, which dynamically loads plugins based on the CurioAlertingConfig.

Plugin Configuration:

Plugins are configured through the config.CurioAlertingConfig struct. Each plugin can have its own configuration section within this struct, enabling or disabling the plugin and setting plugin-specific parameters.

Example:

```go
type MyPlugin struct{}

func (p *MyPlugin) SendAlert(data *plugin.AlertPayload) error {
// Plugin-specific alert sending logic
return nil
}

func NewMyPlugin() *MyPlugin {
return &MyPlugin{}
}

func LoadAlertPlugins(cfg config.CurioAlertingConfig) []plugin.Plugin {
var plugins []plugin.Plugin
if cfg.MyPlugin.Enabled {
plugins = append(plugins, NewMyPlugin())
}
return plugins
}

```
This package leverages the CurioAlertingConfig for plugin configuration,
enabling a modular approach to adding or removing alerting capabilities as required.
*/
package alertmanager
111 changes: 111 additions & 0 deletions alertmanager/plugin/pager_duty.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package plugin

import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"time"

"golang.org/x/xerrors"

"github.com/filecoin-project/curio/deps/config"
"github.com/samber/lo"
)

type PagerDuty struct {
cfg config.PagerDutyConfig
}

func NewPagerDuty(cfg config.PagerDutyConfig) Plugin {
return &PagerDuty{
cfg: cfg,
}
}

// SendAlert sends an alert to PagerDuty with the provided payload data.
// It creates a PDData struct with the provided routing key, event action and payload.
// It creates an HTTP POST request with the PagerDuty event URL as the endpoint and the marshaled JSON data as the request body.
// It sends the request using an HTTP client with a maximum of 5 retries for network errors with exponential backoff before each retry.
// It handles different HTTP response status codes and returns an error based on the status code().
// If all retries fail, it returns an error indicating the last network error encountered.
func (p *PagerDuty) SendAlert(data *AlertPayload) error {

type pdPayload struct {
Summary string `json:"summary"`
Severity string `json:"severity"`
Source string `json:"source"`
Component string `json:"component,omitempty"`
Group string `json:"group,omitempty"`
Class string `json:"class,omitempty"`
CustomDetails interface{} `json:"custom_details,omitempty"`
}

type pdData struct {
RoutingKey string `json:"routing_key"`
EventAction string `json:"event_action"`
Payload *pdPayload `json:"payload"`
}

payload := &pdData{
RoutingKey: p.cfg.PageDutyIntegrationKey,
EventAction: "trigger",
Payload: &pdPayload{
Summary: data.Summary,
Severity: data.Severity,
Source: data.Source,
CustomDetails: data.Details,
},
}

jsonData, err := json.Marshal(payload)
if err != nil {
return fmt.Errorf("error marshaling JSON: %w", err)
}

req, err := http.NewRequest("POST", p.cfg.PagerDutyEventURL, bytes.NewBuffer(jsonData))
if err != nil {
return fmt.Errorf("error creating request: %w", err)
}
req.Header.Set("Content-Type", "application/json")

client := &http.Client{
Timeout: time.Second * 15,
}
iter, _, err := lo.AttemptWithDelay(5, time.Second,
func(index int, duration time.Duration) error {
resp, err := client.Do(req)
if err != nil {
time.Sleep(time.Duration(2*index) * duration) // Exponential backoff
return err
}
defer func() { _ = resp.Body.Close() }()

switch resp.StatusCode {
case 202:
log.Debug("Accepted: The event has been accepted by PagerDuty.")
return nil
case 400:
bd, rerr := io.ReadAll(resp.Body)
if rerr != nil {
return xerrors.Errorf("Bad request: payload JSON is invalid. Failed to read the body: %w", err)
}
return xerrors.Errorf("Bad request: payload JSON is invalid %s", string(bd))
case 429:
log.Debug("Too many API calls, retrying after backoff...")
time.Sleep(time.Duration(5*index) * time.Second) // Exponential backoff
case 500, 501, 502, 503, 504:
log.Debug("Server error, retrying after backoff...")
time.Sleep(time.Duration(5*index) * time.Second) // Exponential backoff
default:
log.Errorw("Response status:", resp.Status)
return xerrors.Errorf("Unexpected HTTP response: %s", resp.Status)
}
return nil
})
if err != nil {
return fmt.Errorf("after %d retries,last error: %w", iter, err)
}
return nil
}
34 changes: 34 additions & 0 deletions alertmanager/plugin/plugin.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package plugin

import (
"time"

"github.com/filecoin-project/curio/deps/config"

logging "github.com/ipfs/go-log/v2"
)

var log = logging.Logger("curio/alertplugins")

type Plugin interface {
SendAlert(data *AlertPayload) error
}

type AlertPayload struct {
Summary string
Severity string
Source string
Details map[string]interface{}
Time time.Time
}

func LoadAlertPlugins(cfg config.CurioAlertingConfig) []Plugin {
var plugins []Plugin
if cfg.PagerDuty.Enable {
plugins = append(plugins, NewPagerDuty(cfg.PagerDuty))
}
if cfg.PrometheusAlertManager.Enable {
plugins = append(plugins, NewPrometheusAlertManager(cfg.PrometheusAlertManager))
}
return plugins
}
99 changes: 99 additions & 0 deletions alertmanager/plugin/prometheus_alertmanager.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package plugin

import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"time"

"github.com/filecoin-project/curio/deps/config"
"github.com/samber/lo"
)

type PrometheusAlertManager struct {
cfg config.PrometheusAlertManagerConfig
}

func NewPrometheusAlertManager(cfg config.PrometheusAlertManagerConfig) Plugin {
return &PrometheusAlertManager{
cfg: cfg,
}
}

// SendAlert sends an alert to Prometheus AlertManager with the provided payload data.
// API reference: https://raw.githubusercontent.com/prometheus/alertmanager/main/api/v2/openapi.yaml
func (p *PrometheusAlertManager) SendAlert(data *AlertPayload) error {
if len(data.Details) == 0 {
return nil
}

type amPayload struct {
StartsAt time.Time `json:"startsAt"`
EndsAt *time.Time `json:"EndsAt,omitempty"`
Annotations map[string]interface{} `json:"annotations"`
Labels map[string]string `json:"labels"`

// is a unique back-link which identifies the causing entity of this alert
//GeneratorURL string `json:"generatorURL"`
}

var alerts []*amPayload
for k, v := range data.Details {
alerts = append(alerts, &amPayload{
StartsAt: data.Time,
Labels: map[string]string{
"alertName": k,
"severity": data.Severity,
"instance": data.Source,
},
Annotations: map[string]interface{}{
"summary": data.Summary,
"details": v,
},
})
}

jsonData, err := json.Marshal(alerts)
if err != nil {
return fmt.Errorf("error marshaling JSON: %w", err)
}
req, err := http.NewRequest("POST", p.cfg.AlertManagerURL, bytes.NewBuffer(jsonData))
if err != nil {
return fmt.Errorf("error creating request: %w", err)
}
req.Header.Set("Content-Type", "application/json")

client := &http.Client{
Timeout: time.Second * 15,
}
iter, _, err := lo.AttemptWithDelay(5, time.Second,
func(index int, duration time.Duration) error {
resp, err := client.Do(req)
if err != nil {
time.Sleep(time.Duration(2*index) * duration) // Exponential backoff
return err
}
defer func() { _ = resp.Body.Close() }()

switch resp.StatusCode {
case http.StatusOK:
return nil
case http.StatusBadRequest, http.StatusInternalServerError:
errBody, err := io.ReadAll(resp.Body)
if err != nil {
time.Sleep(time.Duration(2*index) * duration) // Exponential backoff
return err
}
return fmt.Errorf("error: %s", string(errBody))
default:
time.Sleep(time.Duration(2*index) * duration) // Exponential backoff
return fmt.Errorf("unexpected HTTP response: %s", resp.Status)
}
})
if err != nil {
return fmt.Errorf("after %d retries,last error: %w", iter, err)
}
return nil
}
Loading