Skip to content

Commit

Permalink
feat: added slo duplicates check during validate
Browse files Browse the repository at this point in the history
Issue slok#493
  • Loading branch information
r3code committed Apr 4, 2023
1 parent a9d9dc4 commit 1b76acb
Show file tree
Hide file tree
Showing 9 changed files with 367 additions and 8 deletions.
48 changes: 40 additions & 8 deletions cmd/sloth/commands/validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ type validateCommand struct {
sliPluginsPaths []string
sloPeriodWindowsPath string
sloPeriod string
ignoreSloDuplicates bool
}

// NewValidateCommand returns the validate command.
Expand All @@ -40,7 +41,7 @@ func NewValidateCommand(app *kingpin.Application) Command {
cmd.Flag("sli-plugins-path", "The path to SLI plugins (can be repeated), if not set it disable plugins support.").Short('p').StringsVar(&c.sliPluginsPaths)
cmd.Flag("slo-period-windows-path", "The directory path to custom SLO period windows catalog (replaces default ones).").StringVar(&c.sloPeriodWindowsPath)
cmd.Flag("default-slo-period", "The default SLO period windows to be used for the SLOs.").Default("30d").StringVar(&c.sloPeriod)

cmd.Flag("ignore-slo-duplicates", "Flag to ignore SLO duplicates in specs (service and name used as an SLO/SLI identifier).").Default("false").BoolVar(&c.ignoreSloDuplicates)
return c
}

Expand Down Expand Up @@ -115,6 +116,7 @@ func (v validateCommand) Run(ctx context.Context, config RootConfig) error {
// For every file load the data and start the validation process:
validations := []*fileValidation{}
totalValidations := 0
sloIDs := make(map[string]string, 0)
for _, input := range sloPaths {
// Get SLO spec data.
slxData, err := os.ReadFile(input)
Expand Down Expand Up @@ -142,11 +144,21 @@ func (v validateCommand) Run(ctx context.Context, config RootConfig) error {
// Match the spec type to know how to validate.
switch {
case promYAMLLoader.IsSpecType(ctx, dataB):
slos, promErr := promYAMLLoader.LoadSpec(ctx, dataB)
sloGroup, promErr := promYAMLLoader.LoadSpec(ctx, dataB)
if promErr == nil {
err := gen.GeneratePrometheus(ctx, *slos, io.Discard)
if !v.ignoreSloDuplicates {
for _, slo := range sloGroup.SLOs {
if sloFile, exists := sloIDs[slo.ID]; !exists {
sloIDs[slo.ID] = validation.File
} else {
validation.Errs = append(validation.Errs, fmt.Errorf("SLO duplicated. SLO{service=%s, name=%s}, ID=%s already exists in a file: %s", slo.Service, slo.Name, slo.ID, sloFile))
}
}
}

err := gen.GeneratePrometheus(ctx, *sloGroup, io.Discard)
if err != nil {
validation.Errs = []error{fmt.Errorf("Could not generate Prometheus format rules: %w", err)}
validation.Errs = append(validation.Errs, fmt.Errorf("Could not generate Prometheus format rules: %w", err))
}
continue
}
Expand All @@ -156,21 +168,41 @@ func (v validateCommand) Run(ctx context.Context, config RootConfig) error {
case kubeYAMLLoader.IsSpecType(ctx, dataB):
sloGroup, k8sErr := kubeYAMLLoader.LoadSpec(ctx, dataB)
if k8sErr == nil {
if !v.ignoreSloDuplicates {
for _, slo := range sloGroup.SLOs {
if sloFile, exists := sloIDs[slo.ID]; !exists {
sloIDs[slo.ID] = validation.File
} else {
validation.Errs = append(validation.Errs, fmt.Errorf("SLO duplicated. SLO{service=%s, name=%s}, ID=%s already exists in a file: %s", slo.Service, slo.Name, slo.ID, sloFile))
}
}
}

err := gen.GenerateKubernetes(ctx, *sloGroup, io.Discard)
if err != nil {
validation.Errs = []error{fmt.Errorf("could not generate Kubernetes format rules: %w", err)}
validation.Errs = append(validation.Errs, fmt.Errorf("could not generate Kubernetes format rules: %w", err))
}
continue
}

validation.Errs = []error{fmt.Errorf("Tried loading Kubernetes prometheus SLOs spec, it couldn't: %w", k8sErr)}

case openSLOYAMLLoader.IsSpecType(ctx, dataB):
slos, openSLOErr := openSLOYAMLLoader.LoadSpec(ctx, dataB)
sloGroup, openSLOErr := openSLOYAMLLoader.LoadSpec(ctx, dataB)
if openSLOErr == nil {
err := gen.GenerateOpenSLO(ctx, *slos, io.Discard)
if !v.ignoreSloDuplicates {
for _, slo := range sloGroup.SLOs {
if sloFile, exists := sloIDs[slo.ID]; !exists {
sloIDs[slo.ID] = validation.File
} else {
validation.Errs = append(validation.Errs, fmt.Errorf("SLO duplicated. SLO{service=%s, name=%s}, ID=%s already exists in a file: %s", slo.Service, slo.Name, slo.ID, sloFile))
}
}
}

err := gen.GenerateOpenSLO(ctx, *sloGroup, io.Discard)
if err != nil {
validation.Errs = []error{fmt.Errorf("Could not generate OpenSLO format rules: %w", err)}
validation.Errs = append(validation.Errs, fmt.Errorf("Could not generate OpenSLO format rules: %w", err))
}
continue
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
---
version: "prometheus/v1"
service: "svc01"
labels:
global01k1: global01v1
slos:
- name: "slo1"
objective: 99.9
description: "This is SLO 01."
labels:
global02k1: global02v1
sli:
events:
error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
name: myServiceAlert
labels:
alert01k1: "alert01v1"
annotations:
alert02k1: "alert02k2"
pageAlert:
labels:
alert03k1: "alert03v1"
ticketAlert:
labels:
alert04k1: "alert04v1"
- name: "slo02"
objective: 95
description: "This is SLO 02."
labels:
global03k1: global03v1
sli:
raw:
error_ratio_query: |
sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
/
sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
page_alert:
disable: true
ticket_alert:
disable: true

---
version: "prometheus/v1"
service: "svc01"
labels:
global01k1: global01v1
slos:
- name: "slo1" # duplicate
objective: 99.9
description: "This is SLO 01."
labels:
global02k1: global02v1
sli:
events:
error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
name: myServiceAlert
labels:
alert01k1: "alert01v1"
annotations:
alert02k1: "alert02k2"
pageAlert:
labels:
alert03k1: "alert03v1"
ticketAlert:
labels:
alert04k1: "alert04v1"
- name: "slo02" # duplicate
objective: 95
description: "This is SLO 02."
labels:
global03k1: global03v1
sli:
raw:
error_ratio_query: |
sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
/
sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
page_alert:
disable: true
ticket_alert:
disable: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
apiVersion: sloth.slok.dev/v1
kind: PrometheusServiceLevel
metadata:
name: svc
namespace: test-ns
spec:
service: "svc01"
labels:
global01k1: global01v1
slos:
- name: "slo1"
objective: 99.9
description: "This is SLO 01."
labels:
global02k1: global02v1
sli:
events:
errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
name: myServiceAlert
labels:
alert01k1: "alert01v1"
annotations:
alert02k1: "alert02k2"
pageAlert:
labels:
alert03k1: "alert03v1"
ticketAlert:
labels:
alert04k1: "alert04v1"
- name: "slo02"
objective: 95
description: "This is SLO 02."
labels:
global03k1: global03v1
sli:
raw:
errorRatioQuery: |
sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
/
sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
pageAlert:
disable: true
ticketAlert:
disable: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
apiVersion: sloth.slok.dev/v1
kind: PrometheusServiceLevel
metadata:
name: svc
namespace: test-ns
spec:
service: "svc01"
labels:
global01k1: global01v1
slos:
- name: "slo1"
objective: 99.9
description: "This is SLO 01."
labels:
global02k1: global02v1
sli:
events:
errorQuery: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
totalQuery: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
name: myServiceAlert
labels:
alert01k1: "alert01v1"
annotations:
alert02k1: "alert02k2"
pageAlert:
labels:
alert03k1: "alert03v1"
ticketAlert:
labels:
alert04k1: "alert04v1"
- name: "slo02"
objective: 95
description: "This is SLO 02."
labels:
global03k1: global03v1
sli:
raw:
errorRatioQuery: |
sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
/
sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
pageAlert:
disable: true
ticketAlert:
disable: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
apiVersion: openslo/v1alpha
kind: SLO
metadata:
name: slo1
displayName: Integration test SLO1
spec:
service: svc01
description: "this is SLO1."
budgetingMethod: Occurrences
objectives:
- ratioMetrics:
good:
source: prometheus
queryType: promql
query: sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[{{.window}}]))
total:
source: prometheus
queryType: promql
query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
target: 0.999
timeWindows:
- count: 30
unit: Day
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
apiVersion: openslo/v1alpha
kind: SLO
metadata:
name: slo1
displayName: Integration test SLO1
spec:
service: svc01
description: "this is SLO1."
budgetingMethod: Occurrences
objectives:
- ratioMetrics:
good:
source: prometheus
queryType: promql
query: sum(rate(http_request_duration_seconds_count{job="myservice",code!~"(5..|429)"}[{{.window}}]))
total:
source: prometheus
queryType: promql
query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
target: 0.999
timeWindows:
- count: 30
unit: Day
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
version: "prometheus/v1"
service: "svc01"
labels:
global01k1: global01v1
slos:
- name: "slo1"
objective: 99.9
description: "This is SLO 01."
labels:
global02k1: global02v1
sli:
events:
error_query: sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
total_query: sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
name: myServiceAlert
labels:
alert01k1: "alert01v1"
annotations:
alert02k1: "alert02k2"
pageAlert:
labels:
alert03k1: "alert03v1"
ticketAlert:
labels:
alert04k1: "alert04v1"
- name: "slo02"
objective: 95
description: "This is SLO 02."
labels:
global03k1: global03v1
sli:
raw:
error_ratio_query: |
sum(rate(http_request_duration_seconds_count{job="myservice",code=~"(5..|429)"}[{{.window}}]))
/
sum(rate(http_request_duration_seconds_count{job="myservice"}[{{.window}}]))
alerting:
page_alert:
disable: true
ticket_alert:
disable: true
Loading

0 comments on commit 1b76acb

Please sign in to comment.