[release-v1.58] Add new Prometheus alerts and label existing alerts #3040

Merged 1 commit on Dec 26, 2023
4 changes: 4 additions & 0 deletions automation/test.sh
@@ -106,6 +106,10 @@ fi
echo "Nodes are ready:"
kubectl get nodes

if [ "$KUBEVIRT_STORAGE" == "hpp" ] && [ "$CDI_E2E_FOCUS" == "Destructive" ]; then
kubectl apply -f tests/manifests/snapshot
fi

make cluster-sync

kubectl version
5 changes: 2 additions & 3 deletions cmd/cdi-controller/controller.go
@@ -367,10 +367,9 @@ func getTokenPrivateKey() *rsa.PrivateKey {
}

func registerMetrics() {
	metrics.Registry.MustRegister(controller.IncompleteProfileGauge)
	controller.IncompleteProfileGauge.Set(-1)
	metrics.Registry.MustRegister(controller.DefaultVirtStorageClassesGauge)
	metrics.Registry.MustRegister(controller.StorageProfileStatusGaugeVec)
	metrics.Registry.MustRegister(controller.DataImportCronOutdatedGauge)
	metrics.Registry.MustRegister(dvc.DataVolumePendingGauge)
}

// Restricts some types in the cache's ListWatch to specific fields/labels per GVK at the specified object,
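For readers unfamiliar with the registration pattern above: gauges declared in the controller packages are registered once on controller-runtime's global metrics registry, which the manager serves on its /metrics endpoint. A minimal, self-contained sketch of that pattern (the gauge name and value here are illustrative, not CDI's):

```go
package main

import (
	"github.com/prometheus/client_golang/prometheus"
	"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// exampleGauge stands in for a controller-package gauge such as DataVolumePendingGauge.
var exampleGauge = prometheus.NewGauge(prometheus.GaugeOpts{
	Name: "example_pending_total", // hypothetical metric name
	Help: "Example gauge exposed via the controller-runtime registry",
})

func main() {
	// MustRegister panics on duplicate registration, so this runs exactly once
	// at start-up, mirroring registerMetrics() above.
	metrics.Registry.MustRegister(exampleGauge)
	exampleGauge.Set(3)
}
```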
10 changes: 4 additions & 6 deletions doc/metrics.md
@@ -9,16 +9,14 @@ The number of CDI clone pods with high restart count. Type: Gauge.
CDI install ready. Type: Gauge.
### kubevirt_cdi_dataimportcron_outdated
DataImportCron has an outdated import. Type: Gauge.
### kubevirt_cdi_dataimportcron_outdated_aggregated
Total count of outdated DataImportCron imports. Type: Gauge.
### kubevirt_cdi_default_virt_storageclasses
Number of default virt storage classes currently configured. Type: Gauge.
### kubevirt_cdi_datavolume_pending
Number of DataVolumes pending for default storage class to be configured. Type: Gauge.
### kubevirt_cdi_import_pods_high_restart
The number of CDI import pods with high restart count. Type: Gauge.
### kubevirt_cdi_incomplete_storageprofiles
Total number of incomplete and hence unusable StorageProfile. Type: Gauge.
### kubevirt_cdi_operator_up
CDI operator status. Type: Gauge.
### kubevirt_cdi_storageprofile_info
`StorageProfiles` info labels: `storageclass`, `provisioner`, `complete` indicates if all storage profiles recommended PVC settings are complete, `default` indicates if it's the Kubernetes default storage class, `virtdefault` indicates if it's the default virtualization storage class, `rwx` indicates if the storage class supports `ReadWriteMany`, `smartclone` indicates if it supports snapshot or CSI based clone. Type: Gauge.
### kubevirt_cdi_upload_pods_high_restart
The number of CDI upload server pods with high restart count. Type: Gauge.
## Developing new metrics
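Since the PR adds Prometheus alerts on top of these metrics, here is a hedged sketch — not taken from this PR; the alert name, threshold, and label keys/values are assumptions — of how an alert consuming the new `kubevirt_cdi_datavolume_pending` metric could be declared with the prometheus-operator API types, including the kind of labels that "label existing alerts" refers to:

```go
package main

import (
	"fmt"

	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

// pendingDataVolumeAlert builds a hypothetical alerting rule that fires while
// any DataVolume is stuck Pending because no (virt) default storage class exists.
func pendingDataVolumeAlert() monitoringv1.Rule {
	return monitoringv1.Rule{
		Alert: "CDIDataVolumePending", // hypothetical alert name
		Expr:  intstr.FromString("kubevirt_cdi_datavolume_pending > 0"),
		Annotations: map[string]string{
			"summary": "DataVolumes are Pending, waiting for a default storage class to be configured",
		},
		Labels: map[string]string{
			// the exact label keys/values CDI uses are not shown in this diff
			"severity":               "warning",
			"operator_health_impact": "warning",
		},
	}
}

func main() {
	fmt.Printf("%+v\n", pendingDataVolumeAlert())
}
```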
22 changes: 17 additions & 5 deletions pkg/controller/common/util.go
@@ -448,6 +448,19 @@ func GetVolumeMode(pvc *corev1.PersistentVolumeClaim) corev1.PersistentVolumeMod
return util.ResolveVolumeMode(pvc.Spec.VolumeMode)
}

// GetStorageClassFromDVSpec returns the StorageClassName from DataVolume PVC or Storage spec
func GetStorageClassFromDVSpec(dv *cdiv1.DataVolume) *string {
	if dv.Spec.PVC != nil {
		return dv.Spec.PVC.StorageClassName
	}

	if dv.Spec.Storage != nil {
		return dv.Spec.Storage.StorageClassName
	}

	return nil
}

// getStorageClassByName looks up the storage class based on the name.
// If name is nil, it performs fallback to default according to the provided content type
// If no storage class is found, returns nil
@@ -485,7 +498,7 @@ func GetStorageClassByNameWithVirtFallback(ctx context.Context, client client.Cl
return getStorageClassByName(ctx, client, name, contentType)
}

// getFallbackStorageClass looks for a default virt/k8s storage class according to the boolean
// getFallbackStorageClass looks for a default virt/k8s storage class according to the content type
// If no storage class is found, returns nil
func getFallbackStorageClass(ctx context.Context, client client.Client, contentType cdiv1.DataVolumeContentType) (*storagev1.StorageClass, error) {
storageClasses := &storagev1.StorageClassList{}
@@ -495,16 +508,15 @@ func getFallbackStorageClass(ctx context.Context, client client.Client, contentT
}

if GetContentType(contentType) == cdiv1.DataVolumeKubeVirt {
virtSc := GetPlatformDefaultStorageClass(ctx, storageClasses, AnnDefaultVirtStorageClass)
if virtSc != nil {
if virtSc := GetPlatformDefaultStorageClass(storageClasses, AnnDefaultVirtStorageClass); virtSc != nil {
return virtSc, nil
}
}
return GetPlatformDefaultStorageClass(ctx, storageClasses, AnnDefaultStorageClass), nil
return GetPlatformDefaultStorageClass(storageClasses, AnnDefaultStorageClass), nil
}

// GetPlatformDefaultStorageClass returns the default storage class according to the provided annotation or nil if none found
func GetPlatformDefaultStorageClass(ctx context.Context, storageClasses *storagev1.StorageClassList, defaultAnnotationKey string) *storagev1.StorageClass {
func GetPlatformDefaultStorageClass(storageClasses *storagev1.StorageClassList, defaultAnnotationKey string) *storagev1.StorageClass {
defaultClasses := []storagev1.StorageClass{}

for _, storageClass := range storageClasses.Items {
4 changes: 3 additions & 1 deletion pkg/controller/dataimportcron-conditions.go
@@ -18,9 +18,11 @@ package controller

import (
"github.com/prometheus/client_golang/prometheus"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"

cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
)
@@ -40,7 +42,7 @@ func updateDataImportCronCondition(cron *cdiv1.DataImportCron, conditionType cdi
if status != corev1.ConditionTrue {
gaugeVal = 1
}
DataImportCronOutdatedGauge.With(getPrometheusCronLabels(types.NamespacedName{Namespace: cron.Namespace, Name: cron.Name})).Set(gaugeVal)
DataImportCronOutdatedGauge.With(getPrometheusCronLabels(client.ObjectKeyFromObject(cron))).Set(gaugeVal)
}
if condition := FindDataImportCronConditionByType(cron, conditionType); condition != nil {
updateConditionState(&condition.ConditionState, status, message, reason)
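The only functional content of the conditions change above is swapping a hand-built types.NamespacedName for client.ObjectKeyFromObject. A small sketch showing the two are equivalent (client.ObjectKey is an alias for types.NamespacedName); the namespace and name values are illustrative:

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"

	cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
)

func main() {
	cron := &cdiv1.DataImportCron{
		ObjectMeta: metav1.ObjectMeta{Namespace: "golden-images", Name: "fedora-cron"}, // illustrative values
	}
	byHand := types.NamespacedName{Namespace: cron.Namespace, Name: cron.Name}
	fromObj := client.ObjectKeyFromObject(cron) // client.ObjectKey = types.NamespacedName
	fmt.Println(byHand == fromObj)              // true
}
```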
18 changes: 3 additions & 15 deletions pkg/controller/dataimportcron-controller.go
@@ -318,7 +318,7 @@ func (r *DataImportCronReconciler) update(ctx context.Context, dataImportCron *c
importSucceeded := false

dataVolume := dataImportCron.Spec.Template
explicitScName := getStorageClassFromTemplate(&dataVolume)
explicitScName := cc.GetStorageClassFromDVSpec(&dataVolume)
desiredStorageClass, err := cc.GetStorageClassByNameWithVirtFallback(ctx, r.client, explicitScName, dataVolume.Spec.ContentType)
if err != nil {
return res, err
@@ -777,18 +777,6 @@ func (r *DataImportCronReconciler) getSourceFormat(ctx context.Context, desiredS
return format, nil
}

func getStorageClassFromTemplate(dataVolume *cdiv1.DataVolume) *string {
if dataVolume.Spec.PVC != nil {
return dataVolume.Spec.PVC.StorageClassName
}

if dataVolume.Spec.Storage != nil {
return dataVolume.Spec.Storage.StorageClassName
}

return nil
}

func (r *DataImportCronReconciler) garbageCollectOldImports(ctx context.Context, cron *cdiv1.DataImportCron) error {
if cron.Spec.GarbageCollect != nil && *cron.Spec.GarbageCollect != cdiv1.DataImportCronGarbageCollectOutdated {
return nil
@@ -896,7 +884,7 @@ func (r *DataImportCronReconciler) garbageCollectSnapshots(ctx context.Context,

func (r *DataImportCronReconciler) cleanup(ctx context.Context, cron types.NamespacedName) error {
// Don't keep alerting over a cron that's being deleted, will get set back to 1 again by reconcile loop if needed.
DataImportCronOutdatedGauge.With(getPrometheusCronLabels(cron)).Set(0)
DataImportCronOutdatedGauge.DeletePartialMatch(getPrometheusCronLabels(cron))
if err := r.deleteJobs(ctx, cron); err != nil {
return err
}
@@ -1009,7 +997,7 @@ func addDataImportCronControllerWatches(mgr manager.Manager, c controller.Contro
scName := obj.GetName()
for _, cron := range crons.Items {
dataVolume := cron.Spec.Template
explicitScName := getStorageClassFromTemplate(&dataVolume)
explicitScName := cc.GetStorageClassFromDVSpec(&dataVolume)
templateSc, err := cc.GetStorageClassByNameWithVirtFallback(context.TODO(), mgr.GetClient(), explicitScName, dataVolume.Spec.ContentType)
if err != nil || templateSc == nil {
c.GetLogger().Error(err, "Unable to get storage class", "templateSc", templateSc)
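The cleanup change above replaces Set(0) with DeletePartialMatch. A minimal sketch of the difference, assuming client_golang v1.12+ (where DeletePartialMatch exists) and made-up label names rather than CDI's actual label set: Set(0) keeps the time series exported with value 0, while DeletePartialMatch removes every series whose labels contain the given subset, so a deleted cron stops being reported at all and stops matching alert expressions.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Label names here are assumptions for illustration, not CDI's actual label set.
	outdated := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "kubevirt_cdi_dataimportcron_outdated",
			Help: "DataImportCron has an outdated import",
		},
		[]string{"ns", "cron_name", "pending"},
	)
	outdated.With(prometheus.Labels{"ns": "golden-images", "cron_name": "fedora", "pending": "true"}).Set(1)

	// Drop every series for the deleted cron, whatever the value of "pending".
	removed := outdated.DeletePartialMatch(prometheus.Labels{"ns": "golden-images", "cron_name": "fedora"})
	fmt.Println("series removed:", removed) // 1
}
```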
2 changes: 2 additions & 0 deletions pkg/controller/datavolume/BUILD.bazel
@@ -22,12 +22,14 @@ go_library(
"//pkg/controller/common:go_default_library",
"//pkg/controller/populators:go_default_library",
"//pkg/feature-gates:go_default_library",
"//pkg/monitoring:go_default_library",
"//pkg/token:go_default_library",
"//pkg/util:go_default_library",
"//staging/src/kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1:go_default_library",
"//vendor/github.com/go-logr/logr:go_default_library",
"//vendor/github.com/kubernetes-csi/external-snapshotter/client/v6/apis/volumesnapshot/v1:go_default_library",
"//vendor/github.com/pkg/errors:go_default_library",
"//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
"//vendor/k8s.io/api/authorization/v1:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/api/storage/v1:go_default_library",
36 changes: 35 additions & 1 deletion pkg/controller/datavolume/controller-base.go
@@ -29,6 +29,7 @@ import (

"github.com/go-logr/logr"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"

corev1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
@@ -50,6 +51,7 @@ import (
"kubevirt.io/containerized-data-importer/pkg/common"
cc "kubevirt.io/containerized-data-importer/pkg/controller/common"
featuregates "kubevirt.io/containerized-data-importer/pkg/feature-gates"
"kubevirt.io/containerized-data-importer/pkg/monitoring"
"kubevirt.io/containerized-data-importer/pkg/token"
"kubevirt.io/containerized-data-importer/pkg/util"
)
@@ -76,7 +78,17 @@ const (
claimStorageClassNameField = "spec.storageClassName"
)

var httpClient *http.Client
var (
	httpClient *http.Client

	// DataVolumePendingGauge is the metric we use to count the DataVolumes pending for default storage class to be configured
	DataVolumePendingGauge = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: monitoring.MetricOptsList[monitoring.DataVolumePending].Name,
			Help: monitoring.MetricOptsList[monitoring.DataVolumePending].Help,
		},
	)
)

// Event represents DV controller event
type Event struct {
@@ -241,6 +253,7 @@ func addDataVolumeControllerCommonWatches(mgr manager.Manager, dataVolumeControl
if getDataVolumeOp(mgr.GetLogger(), dv, mgr.GetClient()) != op {
return nil
}
updatePendingDataVolumesGauge(mgr.GetLogger(), dv, mgr.GetClient())
return []reconcile.Request{{NamespacedName: types.NamespacedName{Namespace: dv.Namespace, Name: dv.Name}}}
}),
); err != nil {
@@ -291,6 +304,7 @@ func addDataVolumeControllerCommonWatches(mgr manager.Manager, dataVolumeControl
}
}

// FIXME: Consider removing this Watch. Not sure it's needed anymore as PVCs are Pending in this case.
// Watch for SC updates and reconcile the DVs waiting for default SC
if err := dataVolumeController.Watch(&source.Kind{Type: &storagev1.StorageClass{}}, handler.EnqueueRequestsFromMapFunc(
func(obj client.Object) (reqs []reconcile.Request) {
@@ -388,6 +402,26 @@ func getSourceRefOp(log logr.Logger, dv *cdiv1.DataVolume, client client.Client)
}
}

func updatePendingDataVolumesGauge(log logr.Logger, dv *cdiv1.DataVolume, c client.Client) {
	if cc.GetStorageClassFromDVSpec(dv) != nil {
		return
	}

	dvList := &cdiv1.DataVolumeList{}
	if err := c.List(context.TODO(), dvList, client.MatchingFields{dvPhaseField: string(cdiv1.Pending)}); err != nil {
		log.V(3).Error(err, "Failed listing the pending DataVolumes")
		return
	}

	dvCount := 0
	for _, dv := range dvList.Items {
		if cc.GetStorageClassFromDVSpec(&dv) == nil {
			dvCount++
		}
	}
	DataVolumePendingGauge.Set(float64(dvCount))
}

type dvController interface {
reconcile.Reconciler
sync(log logr.Logger, req reconcile.Request) (dvSyncResult, error)
Expand Down