Skip to content

Commit

Permalink
Suppress alerts to reduce noise of dependent ones
Browse files Browse the repository at this point in the history
This is a follow-up to kubevirt#2998 introducing the following changes to alert
rules:

- CDIDefaultStorageClassDegraded - do not fire when no default SC
  (either k8s or virt).
- CDIDataImportCronOutdated - do not fire when no default SC (either
  k8s or virt), as DIC import DVs use default SC.
- CDINoDefaultStorageClass - fire not only when there is a pending DV
  (and PVC) but also when DV has an empty status (waiting for default
  SC etc.)

Signed-off-by: Arnon Gilboa <agilboa@redhat.com>
  • Loading branch information
arnongilboa authored and kubevirt-bot committed Apr 8, 2024
1 parent 4394617 commit 731f965
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 25 deletions.
21 changes: 18 additions & 3 deletions pkg/controller/datavolume/controller-base.go
Original file line number Diff line number Diff line change
Expand Up @@ -412,19 +412,34 @@ func updatePendingDataVolumesGauge(ctx context.Context, log logr.Logger, dv *cdi
return
}

dvList := &cdiv1.DataVolumeList{}
if err := c.List(ctx, dvList, client.MatchingFields{dvPhaseField: string(cdiv1.Pending)}); err != nil {
countPending, err := getDefaultStorageClassDataVolumeCount(ctx, c, string(cdiv1.Pending))
if err != nil {
log.V(3).Error(err, "Failed listing the pending DataVolumes")
return
}
countUnset, err := getDefaultStorageClassDataVolumeCount(ctx, c, string(cdiv1.PhaseUnset))
if err != nil {
log.V(3).Error(err, "Failed listing the unset DataVolumes")
return
}

metrics.SetDataVolumePending(countPending + countUnset)
}

func getDefaultStorageClassDataVolumeCount(ctx context.Context, c client.Client, dvPhase string) (int, error) {
dvList := &cdiv1.DataVolumeList{}
if err := c.List(ctx, dvList, client.MatchingFields{dvPhaseField: dvPhase}); err != nil {
return 0, err
}

dvCount := 0
for _, dv := range dvList.Items {
if cc.GetStorageClassFromDVSpec(&dv) == nil {
dvCount++
}
}
metrics.SetDataVolumePending(dvCount)

return dvCount, nil
}

type dvController interface {
Expand Down
8 changes: 5 additions & 3 deletions pkg/monitoring/rules/alerts/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ var operatorAlerts = []promv1.Rule{
},
{
Alert: "CDIDataImportCronOutdated",
Expr: intstr.FromString("sum by(ns,cron_name) (kubevirt_cdi_dataimportcron_outdated) > 0"),
For: (*promv1.Duration)(ptr.To("15m")),
Expr: intstr.FromString(`sum by(ns,cron_name) (kubevirt_cdi_dataimportcron_outdated) +
on () (0*(sum(kubevirt_cdi_storageprofile_info{default="true"}) or sum(kubevirt_cdi_storageprofile_info{virtdefault="true"}))) > 0`),
For: (*promv1.Duration)(ptr.To("15m")),
Annotations: map[string]string{
"summary": "DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule",
},
Expand Down Expand Up @@ -96,7 +97,8 @@ var operatorAlerts = []promv1.Rule{
{
Alert: "CDIDefaultStorageClassDegraded",
Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{default="true",rwx="true",smartclone="true"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true",rwx="true",smartclone="true"} or on() vector(0)) == 0`),
sum(kubevirt_cdi_storageprofile_info{virtdefault="true",rwx="true",smartclone="true"} or on() vector(0)) +
on () (0*(sum(kubevirt_cdi_storageprofile_info{default="true"}) or sum(kubevirt_cdi_storageprofile_info{virtdefault="true"}))) == 0`),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "Default storage class has no smart clone or ReadWriteMany",
Expand Down
97 changes: 78 additions & 19 deletions tests/monitoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ var _ = Describe("[Destructive] Monitoring Tests", Serial, func() {
waitForNoPrometheusAlert(f, "CDIMultipleDefaultVirtStorageClasses")
})

It("[test_id:10719]CDINoDefaultStorageClass fired when no default storage class exists, and a DataVolume is waiting for one", func() {
DescribeTable("CDINoDefaultStorageClass fired when no default storage class exists, and a DataVolume is waiting for one", func(withAccessModes bool) {
By("Ensure initial metric values")
defaultSCs := countMetricLabelValue(f, "kubevirt_cdi_storageprofile_info", "default", "true")
Expect(defaultSCs).To(Equal(1))
Expand All @@ -301,6 +301,9 @@ var _ = Describe("[Destructive] Monitoring Tests", Serial, func() {
updateDefaultStorageClasses("false")

dv := utils.NewDataVolumeWithHTTPImportAndStorageSpec("test-dv", "1Gi", fmt.Sprintf(utils.TinyCoreQcow2URL, f.CdiInstallNs))
if !withAccessModes {
dv.Spec.Storage.AccessModes = nil
}
_, err := utils.CreateDataVolumeFromDefinition(f.CdiClient, f.Namespace.Name, dv)
Expect(err).ToNot(HaveOccurred())

Expand Down Expand Up @@ -331,9 +334,12 @@ var _ = Describe("[Destructive] Monitoring Tests", Serial, func() {
}, metricPollingTimeout, metricPollingInterval).Should(BeZero())

waitForNoPrometheusAlert(f, "CDINoDefaultStorageClass")
})
},
Entry("[test_id:10719]with AccessModes, so Pending PVC is created", true),
Entry("[test_id:XXXXX]without AccessModes, so PVC is not created until default storage class exists", false),
)

It("[test_id:10720]CDIDefaultStorageClassDegraded fired when default storage class has no smart clone or ReadWriteMany", func() {
It("[test_id:10720]CDIDefaultStorageClassDegraded fired when there is a default storage class, but it has no smart clone or ReadWriteMany", func() {
rwx := corev1.ReadWriteMany
rwo := corev1.ReadWriteOnce
isStubSnapshotClass := false
Expand Down Expand Up @@ -369,6 +375,14 @@ var _ = Describe("[Destructive] Monitoring Tests", Serial, func() {

waitForPrometheusAlert(f, "CDIDefaultStorageClassDegraded")

By("Verify the alert stops when no default storage class")
updateDefaultStorageClasses("false")
waitForNoPrometheusAlert(f, "CDIDefaultStorageClassDegraded")

By("Verify the alert fires when default storage class is set")
updateDefaultStorageClasses("true")
waitForPrometheusAlert(f, "CDIDefaultStorageClassDegraded")

By("Restore storage profile")
if hasRWX {
updateDefaultStorageClassProfileClaimPropertySets(nil)
Expand Down Expand Up @@ -458,6 +472,47 @@ var _ = Describe("[Destructive] Monitoring Tests", Serial, func() {
waitForNoPrometheusAlert(f, "CDIDataImportCronOutdated")
})

It("[test_id:XXXXX] DataImportCron failing metric expected value but no alert when no default storage class", func() {
numCrons := 2
originalCronMetricVal := getMetricValueWithDefault(f, "kubevirt_cdi_dataimportcron_outdated", true)
Expect(originalCronMetricVal).To(BeZero())

waitForNoPrometheusAlert(f, "CDIDataImportCronOutdated")

reg, err := getDataVolumeSourceRegistry(f)
Expect(err).ToNot(HaveOccurred())
defer func() {
if err := utils.RemoveInsecureRegistry(f.CrClient, *reg.URL); err != nil {
_, _ = fmt.Fprintf(GinkgoWriter, "failed to remove registry; %v", err)
}
}()

updateDefaultStorageClasses("false")

for i := 1; i <= numCrons; i++ {
cron := utils.NewDataImportCron(fmt.Sprintf("cron-test-%d", i), "5Gi", scheduleEveryMinute, fmt.Sprintf("datasource-test-%d", i), 1, *reg)
By(fmt.Sprintf("Create new DataImportCron %s", *reg.URL))
_, err = f.CdiClient.CdiV1beta1().DataImportCrons(f.Namespace.Name).Create(context.TODO(), cron, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

By(fmt.Sprintf("Ensuring metric value incremented to %d", i))
Eventually(func() int {
return getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated")
}, metricPollingTimeout, metricPollingInterval).Should(BeNumerically("==", i))
}

By("Verify no CDIDataImportCronOutdated alert")
Consistently(func() bool {
return checkPrometheusAlertExists(f, "CDIDataImportCronOutdated", false)
}, metricConsistentPollingTimeout, metricPollingInterval).Should(BeTrue())

updateDefaultStorageClasses("true")
By("Ensuring metric value decremented to zero")
Eventually(func() int {
return getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated")
}, metricPollingTimeout, metricPollingInterval).Should(BeZero())
})

It("[test_id:7962] CDIOperatorDown alert firing when operator scaled down", func() {
deploymentName := "cdi-operator"
By("Scale down operator so alert will trigger")
Expand Down Expand Up @@ -692,26 +747,30 @@ func waitForNoPrometheusAlert(f *framework.Framework, alertName string) {

func waitForPrometheusAlertExists(f *framework.Framework, alertName string, shouldExist bool) {
Eventually(func() bool {
result, err := getPrometheusAlerts(f)
if err != nil {
return false
}
return checkPrometheusAlertExists(f, alertName, shouldExist)
}, 2*time.Minute, 1*time.Second).Should(BeTrue())
}

alerts := result["data"].(map[string]interface{})["alerts"].([]interface{})
for _, alert := range alerts {
name := alert.(map[string]interface{})["labels"].(map[string]interface{})["alertname"].(string)
if name == alertName {
if shouldExist {
state := alert.(map[string]interface{})["state"].(string)
By(fmt.Sprintf("Alert %s state %s", name, state))
return state == "pending" || state == "firing"
}
return false
func checkPrometheusAlertExists(f *framework.Framework, alertName string, shouldExist bool) bool {
result, err := getPrometheusAlerts(f)
if err != nil {
return false
}

alerts := result["data"].(map[string]interface{})["alerts"].([]interface{})
for _, alert := range alerts {
name := alert.(map[string]interface{})["labels"].(map[string]interface{})["alertname"].(string)
if name == alertName {
if shouldExist {
state := alert.(map[string]interface{})["state"].(string)
By(fmt.Sprintf("Alert %s state %s", name, state))
return state == "pending" || state == "firing"
}
return false
}
}

return !shouldExist
}, 2*time.Minute, 1*time.Second).Should(BeTrue())
return !shouldExist
}

func getPrometheusAlerts(f *framework.Framework) (alerts map[string]interface{}, err error) {
Expand Down

0 comments on commit 731f965

Please sign in to comment.