Skip to content

Commit

Permalink
Add label with access mode to SELinux metrics
Browse files Browse the repository at this point in the history
In the KEP 1710 we promised to have all SELinux metrics with access mode
label, so cluster admin is able to distinguish when RWOP volumes are
failing to mount (-> SELinuxMountReadWriteOncePod feature gate must be
disabled) or volumes with any other access modes are failing (->
SELinuxMount feature gate must be disabled).

Adding the label to kubelet is quite straightforward, there were some
changes needed in the e2e test. Now grabMetrics() collects values of all
SELinux related metrics with all labels. It only skips unrelated volume
plugins. And waitForMetricIncrease gets metric with all labels on input, so
it can check that say RWOP metric increased and RWX one did not.
  • Loading branch information
jsafrane committed Mar 4, 2024
1 parent a4eaf6e commit c4163a9
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 53 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,53 +25,61 @@ import (

var (
// TODO: add plugin name + access mode labels to all these metrics
seLinuxContainerContextErrors = compbasemetrics.NewGauge(
seLinuxContainerContextErrors = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "volume_manager_selinux_container_errors_total",
Help: "Number of errors when kubelet cannot compute SELinux context for a container. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of containers.",
StabilityLevel: compbasemetrics.ALPHA,
})
seLinuxContainerContextWarnings = compbasemetrics.NewGauge(
},
[]string{"access_mode"},
)
seLinuxContainerContextWarnings = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "volume_manager_selinux_container_warnings_total",
StabilityLevel: compbasemetrics.ALPHA,
Help: "Number of errors when kubelet cannot compute SELinux context for a container that are ignored. They will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.",
})
seLinuxPodContextMismatchErrors = compbasemetrics.NewGauge(
},
[]string{"access_mode"},
)
seLinuxPodContextMismatchErrors = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "volume_manager_selinux_pod_context_mismatch_errors_total",
Help: "Number of errors when a Pod defines different SELinux contexts for its containers that use the same volume. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.",
StabilityLevel: compbasemetrics.ALPHA,
})
seLinuxPodContextMismatchWarnings = compbasemetrics.NewGauge(
},
[]string{"access_mode"},
)
seLinuxPodContextMismatchWarnings = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "volume_manager_selinux_pod_context_mismatch_warnings_total",
Help: "Number of errors when a Pod defines different SELinux contexts for its containers that use the same volume. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.",
StabilityLevel: compbasemetrics.ALPHA,
})
},
[]string{"access_mode"},
)
seLinuxVolumeContextMismatchErrors = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "volume_manager_selinux_volume_context_mismatch_errors_total",
Help: "Number of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"volume_plugin"},
[]string{"volume_plugin", "access_mode"},
)
seLinuxVolumeContextMismatchWarnings = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "volume_manager_selinux_volume_context_mismatch_warnings_total",
Help: "Number of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"volume_plugin"},
[]string{"volume_plugin", "access_mode"},
)
seLinuxVolumesAdmitted = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "volume_manager_selinux_volumes_admitted_total",
Help: "Number of volumes whose SELinux context was fine and will be mounted with mount -o context option.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"volume_plugin"},
[]string{"volume_plugin", "access_mode"},
)

registerMetrics sync.Once
Expand Down
47 changes: 42 additions & 5 deletions pkg/kubelet/volumemanager/cache/desired_state_of_world.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ func (dsw *desiredStateOfWorld) AddPodToVolume(
err)
}
volumePluginName := getVolumePluginNameWithDriver(volumePlugin, volumeSpec)
accessMode := getVolumeAccessMode(volumeSpec)

var volumeName v1.UniqueVolumeName

Expand Down Expand Up @@ -328,7 +329,7 @@ func (dsw *desiredStateOfWorld) AddPodToVolume(
effectiveSELinuxMountLabel = ""
}
if seLinuxFileLabel != "" {
seLinuxVolumesAdmitted.WithLabelValues(volumePluginName).Add(1.0)
seLinuxVolumesAdmitted.WithLabelValues(volumePluginName, accessMode).Add(1.0)
}
vmt := volumeToMount{
volumeName: volumeName,
Expand Down Expand Up @@ -369,8 +370,8 @@ func (dsw *desiredStateOfWorld) AddPodToVolume(
err := handleSELinuxMetricError(
fullErr,
supported,
seLinuxVolumeContextMismatchWarnings.WithLabelValues(volumePluginName),
seLinuxVolumeContextMismatchErrors.WithLabelValues(volumePluginName))
seLinuxVolumeContextMismatchWarnings.WithLabelValues(volumePluginName, accessMode),
seLinuxVolumeContextMismatchErrors.WithLabelValues(volumePluginName, accessMode))
if err != nil {
return "", err
}
Expand Down Expand Up @@ -414,7 +415,13 @@ func (dsw *desiredStateOfWorld) getSELinuxLabel(volumeSpec *volume.Spec, seLinux
newLabel, err := dsw.seLinuxTranslator.SELinuxOptionsToFileLabel(containerContext)
if err != nil {
fullErr := fmt.Errorf("failed to construct SELinux label from context %q: %s", containerContext, err)
if err := handleSELinuxMetricError(fullErr, seLinuxSupported, seLinuxContainerContextWarnings, seLinuxContainerContextErrors); err != nil {
accessMode := getVolumeAccessMode(volumeSpec)
err := handleSELinuxMetricError(
fullErr,
seLinuxSupported,
seLinuxContainerContextWarnings.WithLabelValues(accessMode),
seLinuxContainerContextErrors.WithLabelValues(accessMode))
if err != nil {
return "", false, err
}
}
Expand All @@ -423,8 +430,15 @@ func (dsw *desiredStateOfWorld) getSELinuxLabel(volumeSpec *volume.Spec, seLinux
continue
}
if seLinuxFileLabel != newLabel {
accessMode := getVolumeAccessMode(volumeSpec)

fullErr := fmt.Errorf("volume %s is used with two different SELinux contexts in the same pod: %q, %q", volumeSpec.Name(), seLinuxFileLabel, newLabel)
if err := handleSELinuxMetricError(fullErr, seLinuxSupported, seLinuxPodContextMismatchWarnings, seLinuxPodContextMismatchErrors); err != nil {
err := handleSELinuxMetricError(
fullErr,
seLinuxSupported,
seLinuxPodContextMismatchWarnings.WithLabelValues(accessMode),
seLinuxPodContextMismatchErrors.WithLabelValues(accessMode))
if err != nil {
return "", false, err
}
}
Expand Down Expand Up @@ -685,3 +699,26 @@ func getVolumePluginNameWithDriver(plugin volume.VolumePlugin, spec *volume.Spec
// `/` is used to separate plugin + CSI driver in util.GetUniqueVolumeName() too
return pluginName + "/" + driverName
}

func getVolumeAccessMode(spec *volume.Spec) string {
if spec.PersistentVolume == nil {
// In-line volumes in pod do not have a specific access mode, using "inline".
return "inline"
}
// For purpose of this PR, report only the "highest" access mode in this order: RWX (highest priority), ROX, RWO, RWOP (lowest priority
pv := spec.PersistentVolume
if util.ContainsAccessMode(pv.Spec.AccessModes, v1.ReadWriteMany) {
return "RWX"
}
if util.ContainsAccessMode(pv.Spec.AccessModes, v1.ReadOnlyMany) {
return "ROX"
}
if util.ContainsAccessMode(pv.Spec.AccessModes, v1.ReadWriteOnce) {
return "RWO"
}
if util.ContainsAccessMode(pv.Spec.AccessModes, v1.ReadWriteOncePod) {
return "RWOP"
}
// This should not happen, validation does not allow empty or unknown AccessModes.
return ""
}
109 changes: 72 additions & 37 deletions test/e2e/storage/csi_mock/csi_selinux_mount.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ import (
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/component-base/metrics/testutil"
"k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/test/e2e/feature"
Expand Down Expand Up @@ -316,6 +318,24 @@ var _ = utils.SIGDescribe("CSI Mock selinux on mount", func() {
})
})

var (
// SELinux metrics that have volume_plugin and access_mode labels
metricsWithVolumePluginLabel = sets.NewString(
"volume_manager_selinux_volume_context_mismatch_errors_total",
"volume_manager_selinux_volume_context_mismatch_warnings_total",
"volume_manager_selinux_volumes_admitted_total",
)
// SELinuxMetrics that have only access_mode label
metricsWithoutVolumePluginLabel = sets.NewString(
"volume_manager_selinux_container_errors_total",
"volume_manager_selinux_container_warnings_total",
"volume_manager_selinux_pod_context_mismatch_errors_total",
"volume_manager_selinux_pod_context_mismatch_warnings_total",
)
// All SELinux metrics
allMetrics = metricsWithoutVolumePluginLabel.Union(metricsWithVolumePluginLabel)
)

var _ = utils.SIGDescribe("CSI Mock selinux on mount metrics", func() {
f := framework.NewDefaultFramework("csi-mock-volumes-selinux-metrics")
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
Expand Down Expand Up @@ -389,6 +409,17 @@ var _ = utils.SIGDescribe("CSI Mock selinux on mount metrics", func() {
expectIncreases: sets.NewString("volume_manager_selinux_volume_context_mismatch_errors_total"),
testTags: []interface{}{framework.WithFeatureGate(features.SELinuxMountReadWriteOncePod), framework.WithFeatureGate(features.SELinuxMount)},
},
{
name: "error is bumped on two Pods with a different context on RWX volume and SELinuxMount enabled",
csiDriverSELinuxEnabled: true,
firstPodSELinuxOpts: &seLinuxOpts1,
secondPodSELinuxOpts: &seLinuxOpts2,
secondPodFailureEvent: "conflicting SELinux labels of volume",
volumeMode: v1.ReadWriteMany,
waitForSecondPodStart: false,
expectIncreases: sets.NewString("volume_manager_selinux_volume_context_mismatch_errors_total"),
testTags: []interface{}{framework.WithFeatureGate(features.SELinuxMountReadWriteOncePod), framework.WithFeatureGate(features.SELinuxMount)},
},
{
name: "error is bumped on two Pods with a different context on RWOP volume",
csiDriverSELinuxEnabled: true,
Expand All @@ -405,19 +436,7 @@ var _ = utils.SIGDescribe("CSI Mock selinux on mount metrics", func() {
t := t
testFunc := func(ctx context.Context) {
// Some metrics use CSI driver name as a label, which is "csi-mock-" + the namespace name.
volumePluginLabel := "{volume_plugin=\"kubernetes.io/csi/csi-mock-" + f.Namespace.Name + "\"}"

// All SELinux metrics. Unless explicitly mentioned in test.expectIncreases, these metrics must not grow during
// a test.
allMetrics := sets.NewString(
"volume_manager_selinux_container_errors_total",
"volume_manager_selinux_container_warnings_total",
"volume_manager_selinux_pod_context_mismatch_errors_total",
"volume_manager_selinux_pod_context_mismatch_warnings_total",
"volume_manager_selinux_volume_context_mismatch_errors_total"+volumePluginLabel,
"volume_manager_selinux_volume_context_mismatch_warnings_total"+volumePluginLabel,
"volume_manager_selinux_volumes_admitted_total"+volumePluginLabel,
)
volumePluginLabel := "volume_plugin=\"kubernetes.io/csi/csi-mock-" + f.Namespace.Name + "\""

if framework.NodeOSDistroIs("windows") {
e2eskipper.Skipf("SELinuxMount is only applied on linux nodes -- skipping")
Expand All @@ -444,7 +463,7 @@ var _ = utils.SIGDescribe("CSI Mock selinux on mount metrics", func() {
ginkgo.By("Grabbing initial metrics")
pod, err = m.cs.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
framework.ExpectNoError(err, "getting the initial pod")
metrics, err := grabMetrics(ctx, grabber, pod.Spec.NodeName, allMetrics)
metrics, err := grabMetrics(ctx, grabber, pod.Spec.NodeName, allMetrics, volumePluginLabel)
framework.ExpectNoError(err, "collecting the initial metrics")
dumpMetrics(metrics)

Expand Down Expand Up @@ -472,8 +491,9 @@ var _ = utils.SIGDescribe("CSI Mock selinux on mount metrics", func() {
}

// Assert: count the metrics
ginkgo.By("Waiting for expected metric changes")
err = waitForMetricIncrease(ctx, grabber, pod.Spec.NodeName, allMetrics, t.expectIncreases, metrics, framework.PodStartShortTimeout)
expectIncreaseWithLabels := addLabels(t.expectIncreases, volumePluginLabel, t.volumeMode)
framework.Logf("Waiting for changes of metrics %+v", expectIncreaseWithLabels)
err = waitForMetricIncrease(ctx, grabber, pod.Spec.NodeName, volumePluginLabel, allMetrics, expectIncreaseWithLabels, metrics, framework.PodStartShortTimeout)
framework.ExpectNoError(err, "waiting for metrics %s to increase", t.expectIncreases)
}
// t.testTags is array and it's not possible to use It("name", func(){xxx}, t.testTags...)
Expand All @@ -488,7 +508,7 @@ var _ = utils.SIGDescribe("CSI Mock selinux on mount metrics", func() {
})
})

func grabMetrics(ctx context.Context, grabber *e2emetrics.Grabber, nodeName string, metricNames sets.String) (map[string]float64, error) {
func grabMetrics(ctx context.Context, grabber *e2emetrics.Grabber, nodeName string, metricNames sets.String, volumePluginLabel string) (map[string]float64, error) {
response, err := grabber.GrabFromKubelet(ctx, nodeName)
framework.ExpectNoError(err)

Expand All @@ -497,45 +517,41 @@ func grabMetrics(ctx context.Context, grabber *e2emetrics.Grabber, nodeName stri
if len(samples) == 0 {
continue
}
// Find the *last* sample that has the label we are interested in.
for i := len(samples) - 1; i >= 0; i-- {
metricNameWithLabels := samples[i].Metric.String()
if metricNames.Has(metricNameWithLabels) {
// For each metric + label combination, remember the last sample
for i := range samples {
// E.g. "volume_manager_selinux_pod_context_mismatch_errors_total"
metricName := samples[i].Metric[testutil.MetricNameLabel]
if metricNames.Has(string(metricName)) {
// E.g. "volume_manager_selinux_pod_context_mismatch_errors_total{access_mode="RWOP",volume_plugin="kubernetes.io/csi/csi-mock-ns"}
metricNameWithLabels := samples[i].Metric.String()
// Filter out metrics of any other volume plugin
if strings.Contains(metricNameWithLabels, "volume_plugin=") && !strings.Contains(metricNameWithLabels, volumePluginLabel) {
continue
}
// Overwrite any previous value, so only the last one is stored.
metrics[metricNameWithLabels] = float64(samples[i].Value)
break
}
}
}

return metrics, nil
}

func waitForMetricIncrease(ctx context.Context, grabber *e2emetrics.Grabber, nodeName string, allMetricNames, expectedIncreaseNames sets.String, initialValues map[string]float64, timeout time.Duration) error {
func waitForMetricIncrease(ctx context.Context, grabber *e2emetrics.Grabber, nodeName string, volumePluginLabel string, allMetricNames, expectedIncreaseNames sets.String, initialValues map[string]float64, timeout time.Duration) error {
var noIncreaseMetrics sets.String
var metrics map[string]float64

err := wait.Poll(time.Second, timeout, func() (bool, error) {
var err error
metrics, err = grabMetrics(ctx, grabber, nodeName, allMetricNames)
metrics, err = grabMetrics(ctx, grabber, nodeName, allMetricNames, volumePluginLabel)
if err != nil {
return false, err
}

noIncreaseMetrics = sets.NewString()
// Always evaluate all SELinux metrics to check that the other metrics are not unexpectedly increased.
for name := range allMetricNames {
expectIncrease := false

// allMetricNames can include "{volume_plugin="XXX"}", while expectedIncreaseNames does not.
// Compare them properly. Value of volume_plugin="XXX" was already checked in grabMetrics(),
// we can ignore it here.
for expectedIncreaseMetricName := range expectedIncreaseNames {
if strings.HasPrefix(name, expectedIncreaseMetricName) {
expectIncrease = true
break
}
}
if expectIncrease {
for name := range metrics {
if expectedIncreaseNames.Has(name) {
if metrics[name] <= initialValues[name] {
noIncreaseMetrics.Insert(name)
}
Expand Down Expand Up @@ -570,3 +586,22 @@ func dumpMetrics(metrics map[string]float64) {
framework.Logf("Metric %s: %v", key, metrics[key])
}
}

// Add labels to the metric name based on the current test case
func addLabels(metricNames sets.String, volumePluginLabel string, accessMode v1.PersistentVolumeAccessMode) sets.String {
ret := sets.NewString()
accessModeShortString := helper.GetAccessModesAsString([]v1.PersistentVolumeAccessMode{accessMode})

for metricName := range metricNames {
var metricWithLabels string
if metricsWithVolumePluginLabel.Has(metricName) {
metricWithLabels = fmt.Sprintf("%s{access_mode=\"%s\", %s}", metricName, accessModeShortString, volumePluginLabel)
} else {
metricWithLabels = fmt.Sprintf("%s{access_mode=\"%s\"}", metricName, accessModeShortString)
}

ret.Insert(metricWithLabels)
}

return ret
}

0 comments on commit c4163a9

Please sign in to comment.