Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: added slo duplicates check during validate (#1) #2

Merged
merged 1 commit into from
Apr 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 40 additions & 8 deletions cmd/sloth/commands/validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ type validateCommand struct {
sliPluginsPaths []string
sloPeriodWindowsPath string
sloPeriod string
ignoreSloDuplicates bool
}

// NewValidateCommand returns the validate command.
Expand All @@ -40,7 +41,7 @@ func NewValidateCommand(app *kingpin.Application) Command {
cmd.Flag("sli-plugins-path", "The path to SLI plugins (can be repeated), if not set it disable plugins support.").Short('p').StringsVar(&c.sliPluginsPaths)
cmd.Flag("slo-period-windows-path", "The directory path to custom SLO period windows catalog (replaces default ones).").StringVar(&c.sloPeriodWindowsPath)
cmd.Flag("default-slo-period", "The default SLO period windows to be used for the SLOs.").Default("30d").StringVar(&c.sloPeriod)

cmd.Flag("ignore-slo-duplicates", "Flag to ignore SLO duplicates in specs (service and name used as an SLO/SLI identifier).").Default("false").BoolVar(&c.ignoreSloDuplicates)
return c
}

Expand Down Expand Up @@ -115,6 +116,7 @@ func (v validateCommand) Run(ctx context.Context, config RootConfig) error {
// For every file, load the data and start the validation process:
validations := []*fileValidation{}
totalValidations := 0
sloIDs := make(map[string]string, 0)
for _, input := range sloPaths {
// Get SLO spec data.
slxData, err := os.ReadFile(input)
Expand Down Expand Up @@ -142,11 +144,21 @@ func (v validateCommand) Run(ctx context.Context, config RootConfig) error {
// Match the spec type to know how to validate.
switch {
case promYAMLLoader.IsSpecType(ctx, dataB):
slos, promErr := promYAMLLoader.LoadSpec(ctx, dataB)
sloGroup, promErr := promYAMLLoader.LoadSpec(ctx, dataB)
if promErr == nil {
err := gen.GeneratePrometheus(ctx, *slos, io.Discard)
if !v.ignoreSloDuplicates {
for _, slo := range sloGroup.SLOs {
if sloFile, exists := sloIDs[slo.ID]; !exists {
sloIDs[slo.ID] = validation.File
} else {
validation.Errs = append(validation.Errs, fmt.Errorf("SLO duplicated. SLO{service=%s, name=%s}, ID=%s already exists in a file: %s", slo.Service, slo.Name, slo.ID, sloFile))
}
}
}

err := gen.GeneratePrometheus(ctx, *sloGroup, io.Discard)
if err != nil {
validation.Errs = []error{fmt.Errorf("Could not generate Prometheus format rules: %w", err)}
validation.Errs = append(validation.Errs, fmt.Errorf("Could not generate Prometheus format rules: %w", err))
}
continue
}
Expand All @@ -156,21 +168,41 @@ func (v validateCommand) Run(ctx context.Context, config RootConfig) error {
case kubeYAMLLoader.IsSpecType(ctx, dataB):
sloGroup, k8sErr := kubeYAMLLoader.LoadSpec(ctx, dataB)
if k8sErr == nil {
if !v.ignoreSloDuplicates {
for _, slo := range sloGroup.SLOs {
if sloFile, exists := sloIDs[slo.ID]; !exists {
sloIDs[slo.ID] = validation.File
} else {
validation.Errs = append(validation.Errs, fmt.Errorf("SLO duplicated. SLO{service=%s, name=%s}, ID=%s already exists in a file: %s", slo.Service, slo.Name, slo.ID, sloFile))
}
}
}

err := gen.GenerateKubernetes(ctx, *sloGroup, io.Discard)
if err != nil {
validation.Errs = []error{fmt.Errorf("could not generate Kubernetes format rules: %w", err)}
validation.Errs = append(validation.Errs, fmt.Errorf("could not generate Kubernetes format rules: %w", err))
}
continue
}

validation.Errs = []error{fmt.Errorf("Tried loading Kubernetes prometheus SLOs spec, it couldn't: %w", k8sErr)}

case openSLOYAMLLoader.IsSpecType(ctx, dataB):
slos, openSLOErr := openSLOYAMLLoader.LoadSpec(ctx, dataB)
sloGroup, openSLOErr := openSLOYAMLLoader.LoadSpec(ctx, dataB)
if openSLOErr == nil {
err := gen.GenerateOpenSLO(ctx, *slos, io.Discard)
if !v.ignoreSloDuplicates {
for _, slo := range sloGroup.SLOs {
if sloFile, exists := sloIDs[slo.ID]; !exists {
sloIDs[slo.ID] = validation.File
} else {
validation.Errs = append(validation.Errs, fmt.Errorf("SLO duplicated. SLO{service=%s, name=%s}, ID=%s already exists in a file: %s", slo.Service, slo.Name, slo.ID, sloFile))
}
}
}

err := gen.GenerateOpenSLO(ctx, *sloGroup, io.Discard)
if err != nil {
validation.Errs = []error{fmt.Errorf("Could not generate OpenSLO format rules: %w", err)}
validation.Errs = append(validation.Errs, fmt.Errorf("Could not generate OpenSLO format rules: %w", err))
}
continue
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
---
version: "prometheus/v1"
service: "svc01"
labels:
global01k1: global01v1
slos:
- name: "slo1"
objective: 99.9
description: "This is SLO 01."
labels:
global02k1: global02v1
sli:
events:
error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
name: myServiceAlert
labels:
alert01k1: "alert01v1"
annotations:
alert02k1: "alert02k2"
pageAlert:
labels:
alert03k1: "alert03v1"
ticketAlert:
labels:
alert04k1: "alert04v1"
- name: "slo02"
objective: 95
description: "This is SLO 02."
labels:
global03k1: global03v1
sli:
raw:
error_ratio_query: |
sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
/
sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
page_alert:
disable: true
ticket_alert:
disable: true

---
version: "prometheus/v1"
service: "svc01"
labels:
global01k1: global01v1
slos:
- name: "slo1" # duplicate
objective: 99.9
description: "This is SLO 01."
labels:
global02k1: global02v1
sli:
events:
error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
name: myServiceAlert
labels:
alert01k1: "alert01v1"
annotations:
alert02k1: "alert02k2"
pageAlert:
labels:
alert03k1: "alert03v1"
ticketAlert:
labels:
alert04k1: "alert04v1"
- name: "slo02" # duplicate
objective: 95
description: "This is SLO 02."
labels:
global03k1: global03v1
sli:
raw:
error_ratio_query: |
sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
/
sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
page_alert:
disable: true
ticket_alert:
disable: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
apiVersion: sloth.slok.dev/v1
kind: PrometheusServiceLevel
metadata:
name: svc
namespace: test-ns
spec:
service: "svc01"
labels:
global01k1: global01v1
slos:
- name: "slo1"
objective: 99.9
description: "This is SLO 01."
labels:
global02k1: global02v1
sli:
events:
errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
name: myServiceAlert
labels:
alert01k1: "alert01v1"
annotations:
alert02k1: "alert02k2"
pageAlert:
labels:
alert03k1: "alert03v1"
ticketAlert:
labels:
alert04k1: "alert04v1"
- name: "slo02"
objective: 95
description: "This is SLO 02."
labels:
global03k1: global03v1
sli:
raw:
errorRatioQuery: |
sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
/
sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
pageAlert:
disable: true
ticketAlert:
disable: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
apiVersion: sloth.slok.dev/v1
kind: PrometheusServiceLevel
metadata:
name: svc
namespace: test-ns
spec:
service: "svc01"
labels:
global01k1: global01v1
slos:
- name: "slo1"
objective: 99.9
description: "This is SLO 01."
labels:
global02k1: global02v1
sli:
events:
errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
name: myServiceAlert
labels:
alert01k1: "alert01v1"
annotations:
alert02k1: "alert02k2"
pageAlert:
labels:
alert03k1: "alert03v1"
ticketAlert:
labels:
alert04k1: "alert04v1"
- name: "slo02"
objective: 95
description: "This is SLO 02."
labels:
global03k1: global03v1
sli:
raw:
errorRatioQuery: |
sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
/
sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
pageAlert:
disable: true
ticketAlert:
disable: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
apiVersion: openslo/v1alpha
kind: SLO
metadata:
name: slo1
displayName: Integration test SLO1
spec:
service: svc01
description: "this is SLO1."
budgetingMethod: Occurrences
objectives:
- ratioMetrics:
good:
source: prometheus
queryType: promql
query: sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[{{.window}}]))
total:
source: prometheus
queryType: promql
query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
target: 0.999
timeWindows:
- count: 30
unit: Day
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
apiVersion: openslo/v1alpha
kind: SLO
metadata:
name: slo1
displayName: Integration test SLO1
spec:
service: svc01
description: "this is SLO1."
budgetingMethod: Occurrences
objectives:
- ratioMetrics:
good:
source: prometheus
queryType: promql
query: sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[{{.window}}]))
total:
source: prometheus
queryType: promql
query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
target: 0.999
timeWindows:
- count: 30
unit: Day
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
version: "prometheus/v1"
service: "svc01"
labels:
global01k1: global01v1
slos:
- name: "slo1"
objective: 99.9
description: "This is SLO 01."
labels:
global02k1: global02v1
sli:
events:
error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
name: myServiceAlert
labels:
alert01k1: "alert01v1"
annotations:
alert02k1: "alert02k2"
pageAlert:
labels:
alert03k1: "alert03v1"
ticketAlert:
labels:
alert04k1: "alert04v1"
- name: "slo02"
objective: 95
description: "This is SLO 02."
labels:
global03k1: global03v1
sli:
raw:
error_ratio_query: |
sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
/
sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
page_alert:
disable: true
ticket_alert:
disable: true
Loading