[release-v1.58] Add new Prometheus alerts and label existing alerts (#3040)

Manual backport of #2998 & #3038

- CDINoDefaultStorageClass - not having a default (or virt-default) SC is
not an OpenShift error per se, as admins may prefer their cluster users to
use only explicit SC names. However, in the CDI context, when a DV is
created against the default SC and no default exists, we fire an error
event and the PVC stays Pending waiting for the default SC; when such
Pending PVCs exist, we fire this alert.
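(A plausible shape for this alert's expression, inferred from the
kubevirt_cdi_datavolume_pending gauge added in this commit rather than
quoted from the rule manifests: kubevirt_cdi_datavolume_pending > 0,
sustained for some duration.)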

- CDIDefaultStorageClassDegraded - fires when the default (or virt-default)
SC does not support CSI/snapshot clone (smart clone) or lacks the
ReadWriteMany access mode (needed for live migration).
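(Likewise inferred, not quoted: the expression presumably selects degraded
series from the info metric renamed below, e.g.
kubevirt_cdi_storageprofile_info{default="true", rwx="false"} or
kubevirt_cdi_storageprofile_info{default="true", smartclone="false"}.)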

- CDIStorageProfilesIncomplete - add storageClass and provisioner
labels.

- CDIDataImportCronOutdated - add dataImportCron namespace and name
labels.

Also:
* Rename the metric kubevirt_cdi_storageprofile_status to
kubevirt_cdi_storageprofile_info, since it always reports the value 1 and
the label values carry the details about the storage class and storage
profile (see the sketch after this list).
* Add snapshot manifests for tests and deploy snapshot CRDs in the hpp
destructive lane
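
The *_info suffix follows the Prometheus info-metric convention: the sample
value is pinned to 1 and the labels carry the data, so recording rules and
alerts match on labels instead of values. A minimal Go sketch of the pattern
(label names taken from doc/metrics.md below; the declaration and sample
values here are illustrative, not CDI's actual code):

package main

import "github.com/prometheus/client_golang/prometheus"

// Info-style metric: the value is always 1; the labels carry the payload.
var storageProfileInfo = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "kubevirt_cdi_storageprofile_info",
		Help: "StorageProfiles info",
	},
	[]string{"storageclass", "provisioner", "complete", "default", "virtdefault", "rwx", "smartclone"},
)

func main() {
	prometheus.MustRegister(storageProfileInfo)
	// Hypothetical storage class; consumers key off the labels, not the 1.
	storageProfileInfo.WithLabelValues(
		"rook-ceph-block", "rook-ceph.rbd.csi.ceph.com",
		"true", "true", "false", "true", "true",
	).Set(1)
}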

Signed-off-by: Arnon Gilboa <agilboa@redhat.com>
arnongilboa committed Dec 26, 2023
1 parent 0dfa4e0 commit 10bb5e6
Showing 19 changed files with 1,496 additions and 180 deletions.
4 changes: 4 additions & 0 deletions automation/test.sh
@@ -106,6 +106,10 @@ fi
echo "Nodes are ready:"
kubectl get nodes

if [ "$KUBEVIRT_STORAGE" == "hpp" ] && [ "$CDI_E2E_FOCUS" == "Destructive" ]; then
kubectl apply -f tests/manifests/snapshot
fi

make cluster-sync

kubectl version
5 changes: 2 additions & 3 deletions cmd/cdi-controller/controller.go
@@ -367,10 +367,9 @@ func getTokenPrivateKey() *rsa.PrivateKey {
}

func registerMetrics() {
metrics.Registry.MustRegister(controller.IncompleteProfileGauge)
controller.IncompleteProfileGauge.Set(-1)
metrics.Registry.MustRegister(controller.DefaultVirtStorageClassesGauge)
metrics.Registry.MustRegister(controller.StorageProfileStatusGaugeVec)
metrics.Registry.MustRegister(controller.DataImportCronOutdatedGauge)
metrics.Registry.MustRegister(dvc.DataVolumePendingGauge)
}

// Restricts some types in the cache's ListWatch to specific fields/labels per GVK at the specified object,
10 changes: 4 additions & 6 deletions doc/metrics.md
@@ -9,16 +9,14 @@ The number of CDI clone pods with high restart count. Type: Gauge.
CDI install ready. Type: Gauge.
### kubevirt_cdi_dataimportcron_outdated
DataImportCron has an outdated import. Type: Gauge.
### kubevirt_cdi_dataimportcron_outdated_aggregated
Total count of outdated DataImportCron imports. Type: Gauge.
### kubevirt_cdi_default_virt_storageclasses
Number of default virt storage classes currently configured. Type: Gauge.
### kubevirt_cdi_datavolume_pending
Number of DataVolumes pending for default storage class to be configured. Type: Gauge.
### kubevirt_cdi_import_pods_high_restart
The number of CDI import pods with high restart count. Type: Gauge.
### kubevirt_cdi_incomplete_storageprofiles
Total number of incomplete and hence unusable StorageProfiles. Type: Gauge.
### kubevirt_cdi_operator_up
CDI operator status. Type: Gauge.
### kubevirt_cdi_storageprofile_info
`StorageProfiles` info labels: `storageclass` and `provisioner` identify the profile; `complete` indicates whether all the storage profile's recommended PVC settings are set; `default` indicates whether it is the Kubernetes default storage class; `virtdefault` indicates whether it is the default virtualization storage class; `rwx` indicates whether the storage class supports `ReadWriteMany`; `smartclone` indicates whether it supports snapshot- or CSI-based clone. Type: Gauge.
### kubevirt_cdi_upload_pods_high_restart
The number of CDI upload server pods with high restart count. Type: Gauge.
## Developing new metrics
22 changes: 17 additions & 5 deletions pkg/controller/common/util.go
@@ -448,6 +448,19 @@ func GetVolumeMode(pvc *corev1.PersistentVolumeClaim) corev1.PersistentVolumeMod
return util.ResolveVolumeMode(pvc.Spec.VolumeMode)
}

// GetStorageClassFromDVSpec returns the StorageClassName from DataVolume PVC or Storage spec
func GetStorageClassFromDVSpec(dv *cdiv1.DataVolume) *string {
if dv.Spec.PVC != nil {
return dv.Spec.PVC.StorageClassName
}

if dv.Spec.Storage != nil {
return dv.Spec.Storage.StorageClassName
}

return nil
}
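// Editorial aside, not part of this diff: the API treats the PVC and
// Storage specs as mutually exclusive alternatives, so whichever one is
// present supplies the (possibly nil) explicit StorageClassName.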

// getStorageClassByName looks up the storage class based on the name.
// If name is nil, it performs fallback to default according to the provided content type
// If no storage class is found, returns nil
@@ -485,7 +498,7 @@ func GetStorageClassByNameWithVirtFallback(ctx context.Context, client client.Cl
return getStorageClassByName(ctx, client, name, contentType)
}

// getFallbackStorageClass looks for a default virt/k8s storage class according to the boolean
// getFallbackStorageClass looks for a default virt/k8s storage class according to the content type
// If no storage class is found, returns nil
func getFallbackStorageClass(ctx context.Context, client client.Client, contentType cdiv1.DataVolumeContentType) (*storagev1.StorageClass, error) {
storageClasses := &storagev1.StorageClassList{}
@@ -495,16 +508,15 @@ func getFallbackStorageClass(ctx context.Context, client client.Client, contentT
}

if GetContentType(contentType) == cdiv1.DataVolumeKubeVirt {
virtSc := GetPlatformDefaultStorageClass(ctx, storageClasses, AnnDefaultVirtStorageClass)
if virtSc != nil {
if virtSc := GetPlatformDefaultStorageClass(storageClasses, AnnDefaultVirtStorageClass); virtSc != nil {
return virtSc, nil
}
}
return GetPlatformDefaultStorageClass(ctx, storageClasses, AnnDefaultStorageClass), nil
return GetPlatformDefaultStorageClass(storageClasses, AnnDefaultStorageClass), nil
}

// GetPlatformDefaultStorageClass returns the default storage class according to the provided annotation or nil if none found
func GetPlatformDefaultStorageClass(ctx context.Context, storageClasses *storagev1.StorageClassList, defaultAnnotationKey string) *storagev1.StorageClass {
func GetPlatformDefaultStorageClass(storageClasses *storagev1.StorageClassList, defaultAnnotationKey string) *storagev1.StorageClass {
defaultClasses := []storagev1.StorageClass{}

for _, storageClass := range storageClasses.Items {
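The hunk above cuts off before the selection loop; here is a minimal sketch
of annotation-driven default lookup under the usual convention that a class
opts in with the annotation value "true" (pickDefault is a hypothetical
stand-in, not the shipped implementation):

func pickDefault(scs *storagev1.StorageClassList, annKey string) *storagev1.StorageClass {
	for i := range scs.Items {
		// e.g. annKey = "storageclass.kubernetes.io/is-default-class"
		if scs.Items[i].Annotations[annKey] == "true" {
			return &scs.Items[i]
		}
	}
	// The real controller also has to cope with several classes claiming
	// default (hence the kubevirt_cdi_default_virt_storageclasses gauge);
	// this sketch simply returns the first match.
	return nil
}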
4 changes: 3 additions & 1 deletion pkg/controller/dataimportcron-conditions.go
@@ -18,9 +18,11 @@ package controller

import (
"github.com/prometheus/client_golang/prometheus"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"

cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
)
@@ -40,7 +42,7 @@ func updateDataImportCronCondition(cron *cdiv1.DataImportCron, conditionType cdi
if status != corev1.ConditionTrue {
gaugeVal = 1
}
DataImportCronOutdatedGauge.With(getPrometheusCronLabels(types.NamespacedName{Namespace: cron.Namespace, Name: cron.Name})).Set(gaugeVal)
DataImportCronOutdatedGauge.With(getPrometheusCronLabels(client.ObjectKeyFromObject(cron))).Set(gaugeVal)
}
if condition := FindDataImportCronConditionByType(cron, conditionType); condition != nil {
updateConditionState(&condition.ConditionState, status, message, reason)
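(In controller-runtime, client.ObjectKey is a type alias for
types.NamespacedName, so client.ObjectKeyFromObject(cron) builds exactly the
value the replaced composite literal spelled out, just more tersely.)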
18 changes: 3 additions & 15 deletions pkg/controller/dataimportcron-controller.go
@@ -318,7 +318,7 @@ func (r *DataImportCronReconciler) update(ctx context.Context, dataImportCron *c
importSucceeded := false

dataVolume := dataImportCron.Spec.Template
explicitScName := getStorageClassFromTemplate(&dataVolume)
explicitScName := cc.GetStorageClassFromDVSpec(&dataVolume)
desiredStorageClass, err := cc.GetStorageClassByNameWithVirtFallback(ctx, r.client, explicitScName, dataVolume.Spec.ContentType)
if err != nil {
return res, err
@@ -777,18 +777,6 @@ func (r *DataImportCronReconciler) getSourceFormat(ctx context.Context, desiredS
return format, nil
}

func getStorageClassFromTemplate(dataVolume *cdiv1.DataVolume) *string {
if dataVolume.Spec.PVC != nil {
return dataVolume.Spec.PVC.StorageClassName
}

if dataVolume.Spec.Storage != nil {
return dataVolume.Spec.Storage.StorageClassName
}

return nil
}

func (r *DataImportCronReconciler) garbageCollectOldImports(ctx context.Context, cron *cdiv1.DataImportCron) error {
if cron.Spec.GarbageCollect != nil && *cron.Spec.GarbageCollect != cdiv1.DataImportCronGarbageCollectOutdated {
return nil
@@ -896,7 +884,7 @@ func (r *DataImportCronReconciler) garbageCollectSnapshots(ctx context.Context,

func (r *DataImportCronReconciler) cleanup(ctx context.Context, cron types.NamespacedName) error {
// Don't keep alerting over a cron that's being deleted; it will get set back to 1 again by the reconcile loop if needed.
DataImportCronOutdatedGauge.With(getPrometheusCronLabels(cron)).Set(0)
DataImportCronOutdatedGauge.DeletePartialMatch(getPrometheusCronLabels(cron))
if err := r.deleteJobs(ctx, cron); err != nil {
return err
}
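The switch from Set(0) to DeletePartialMatch above is a cardinality fix as
much as a cosmetic one: setting the gauge to 0 keeps exporting a stale
zero-valued series for every deleted cron, whereas DeletePartialMatch
(client_golang) drops all series whose labels match the given subset, so the
deleted cron disappears from the scrape entirely.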
@@ -1009,7 +997,7 @@ func addDataImportCronControllerWatches(mgr manager.Manager, c controller.Contro
scName := obj.GetName()
for _, cron := range crons.Items {
dataVolume := cron.Spec.Template
explicitScName := getStorageClassFromTemplate(&dataVolume)
explicitScName := cc.GetStorageClassFromDVSpec(&dataVolume)
templateSc, err := cc.GetStorageClassByNameWithVirtFallback(context.TODO(), mgr.GetClient(), explicitScName, dataVolume.Spec.ContentType)
if err != nil || templateSc == nil {
c.GetLogger().Error(err, "Unable to get storage class", "templateSc", templateSc)
2 changes: 2 additions & 0 deletions pkg/controller/datavolume/BUILD.bazel
@@ -22,12 +22,14 @@ go_library(
"//pkg/controller/common:go_default_library",
"//pkg/controller/populators:go_default_library",
"//pkg/feature-gates:go_default_library",
"//pkg/monitoring:go_default_library",
"//pkg/token:go_default_library",
"//pkg/util:go_default_library",
"//staging/src/kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1:go_default_library",
"//vendor/github.com/go-logr/logr:go_default_library",
"//vendor/github.com/kubernetes-csi/external-snapshotter/client/v6/apis/volumesnapshot/v1:go_default_library",
"//vendor/github.com/pkg/errors:go_default_library",
"//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
"//vendor/k8s.io/api/authorization/v1:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/api/storage/v1:go_default_library",
36 changes: 35 additions & 1 deletion pkg/controller/datavolume/controller-base.go
@@ -29,6 +29,7 @@ import (

"github.com/go-logr/logr"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"

corev1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
@@ -50,6 +51,7 @@ import (
"kubevirt.io/containerized-data-importer/pkg/common"
cc "kubevirt.io/containerized-data-importer/pkg/controller/common"
featuregates "kubevirt.io/containerized-data-importer/pkg/feature-gates"
"kubevirt.io/containerized-data-importer/pkg/monitoring"
"kubevirt.io/containerized-data-importer/pkg/token"
"kubevirt.io/containerized-data-importer/pkg/util"
)
@@ -76,7 +78,17 @@ const (
claimStorageClassNameField = "spec.storageClassName"
)

var httpClient *http.Client
var (
httpClient *http.Client

// DataVolumePendingGauge is the metric we use to count the DataVolumes pending for default storage class to be configured
DataVolumePendingGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: monitoring.MetricOptsList[monitoring.DataVolumePending].Name,
Help: monitoring.MetricOptsList[monitoring.DataVolumePending].Help,
},
)
)

// Event represents DV controller event
type Event struct {
@@ -241,6 +253,7 @@ func addDataVolumeControllerCommonWatches(mgr manager.Manager, dataVolumeControl
if getDataVolumeOp(mgr.GetLogger(), dv, mgr.GetClient()) != op {
return nil
}
updatePendingDataVolumesGauge(mgr.GetLogger(), dv, mgr.GetClient())
return []reconcile.Request{{NamespacedName: types.NamespacedName{Namespace: dv.Namespace, Name: dv.Name}}}
}),
); err != nil {
@@ -291,6 +304,7 @@
}
}

// FIXME: Consider removing this Watch. Not sure it's needed anymore as PVCs are Pending in this case.
// Watch for SC updates and reconcile the DVs waiting for default SC
if err := dataVolumeController.Watch(&source.Kind{Type: &storagev1.StorageClass{}}, handler.EnqueueRequestsFromMapFunc(
func(obj client.Object) (reqs []reconcile.Request) {
@@ -388,6 +402,26 @@ func getSourceRefOp(log logr.Logger, dv *cdiv1.DataVolume, client client.Client)
}
}

func updatePendingDataVolumesGauge(log logr.Logger, dv *cdiv1.DataVolume, c client.Client) {
if cc.GetStorageClassFromDVSpec(dv) != nil {
return
}

dvList := &cdiv1.DataVolumeList{}
if err := c.List(context.TODO(), dvList, client.MatchingFields{dvPhaseField: string(cdiv1.Pending)}); err != nil {
log.V(3).Error(err, "Failed listing the pending DataVolumes")
return
}

dvCount := 0
for _, dv := range dvList.Items {
if cc.GetStorageClassFromDVSpec(&dv) == nil {
dvCount++
}
}
DataVolumePendingGauge.Set(float64(dvCount))
}

type dvController interface {
reconcile.Reconciler
sync(log logr.Logger, req reconcile.Request) (dvSyncResult, error)
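A design note on updatePendingDataVolumesGauge above: the gauge is recomputed
from a fresh indexed List of Pending DataVolumes on every relevant event
rather than incremented and decremented, which keeps it self-correcting
across missed events and controller restarts; only DVs that specify no
storage class in either the PVC or the Storage spec are counted, matching the
"waiting for a default SC" semantics of the new alert.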