Check stable metrics aren't changed #1836

Closed
2 changes: 1 addition & 1 deletion Makefile
@@ -8,7 +8,7 @@ LATEST_RELEASE_BRANCH := release-$(shell grep -ohE "[0-9]+.[0-9]+" VERSION)
BRANCH = $(strip $(shell git rev-parse --abbrev-ref HEAD))
DOCKER_CLI ?= docker
PROMTOOL_CLI ?= promtool
-PKGS = $(shell go list ./... | grep -v /vendor/ | grep -v /tests/e2e)
+PKGS = $(shell go list ./... | grep -v /vendor/ | grep -v /tests/e2e | grep -v /tests/stable)
ARCH ?= $(shell go env GOARCH)
BUILD_DATE = $(shell date -u +'%Y-%m-%dT%H:%M:%SZ')
GIT_COMMIT ?= $(shell git rev-parse --short HEAD)
3 changes: 3 additions & 0 deletions tests/e2e.sh
@@ -163,6 +163,9 @@ mkdir -p ${KUBE_STATE_METRICS_LOG_DIR}
echo "access kube-state-metrics metrics endpoint"
curl -s "http://localhost:8001/api/v1/namespaces/kube-system/services/kube-state-metrics:http-metrics/proxy/metrics" >${KUBE_STATE_METRICS_LOG_DIR}/metrics

+# check stable metrics aren't changed
+go test ./tests/stable/check_test.go --collectedMetricsFile "$(realpath ${KUBE_STATE_METRICS_LOG_DIR}/metrics)" --stableMetricsFile "$(realpath tests/stable/testdata/stablemetrics.yaml)"

KUBE_STATE_METRICS_STATUS=$(curl -s "http://localhost:8001/api/v1/namespaces/kube-system/services/kube-state-metrics:http-metrics/proxy/healthz")
if [[ "${KUBE_STATE_METRICS_STATUS}" == "OK" ]]; then
echo "kube-state-metrics is still running after accessing metrics endpoint"
206 changes: 206 additions & 0 deletions tests/stable/check_test.go
@@ -0,0 +1,206 @@
/*
Copyright 2022 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// To skip a metric in this test, add its name to skippedStableMetrics.
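// For example, to skip a (hypothetical) metric:
//
//	skippedStableMetrics = []string{"kube_example_stable_metric"}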
package stable

import (
	"flag"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"testing"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	prommodel "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/expfmt"
	"gopkg.in/yaml.v3"

	"k8s.io/klog/v2"
)

var promText string
var stableYaml string

var skippedStableMetrics = []string{}

type Metric struct {
	Name   string   `yaml:"name"`
	Help   string   `yaml:"help"`
	Type   string   `yaml:"type"`
	Labels []string `yaml:"labels"`
	// Buckets applies to histogram-type metrics only.
	Buckets []float64 `yaml:"buckets,omitempty"`
}

func TestMain(m *testing.M) {
	flag.StringVar(&promText, "collectedMetricsFile", "", "input prometheus metrics text file, text format")
	flag.StringVar(&stableYaml, "stableMetricsFile", "", "expected stable metrics yaml file, yaml format")
	flag.Parse()
	m.Run()
}

func TestStableMetrics(t *testing.T) {
	mf, err := parsePromText(promText)
	if err != nil {
		t.Fatalf("Can't parse collected prometheus metrics text. err = %v", err)
	}
	collectedStableMetrics := extractStableMetrics(mf)
	printMetric(collectedStableMetrics)

	expectedStableMetrics, err := readYaml(stableYaml)
	if err != nil {
		t.Fatalf("Can't read stable metrics from file. err = %v", err)
	}

	err = compare(collectedStableMetrics, *expectedStableMetrics, skippedStableMetrics)
	if err != nil {
		t.Fatalf("Stable metrics changed: err = %v", err)
	} else {
		klog.Infoln("Passed")
	}
}

func compare(collectedStableMetrics []Metric, expectedStableMetrics []Metric, skippedStableMetrics []string) error {
	skipMap := map[string]int{}
	for _, v := range skippedStableMetrics {
		skipMap[v] = 1
	}
	collected := convertToMap(collectedStableMetrics, skipMap)
	expected := convertToMap(expectedStableMetrics, skipMap)

	var ok bool

	for _, name := range sortedKeys(expected) {
		metric := expected[name]
		var collectedMetric Metric
		if collectedMetric, ok = collected[name]; !ok {
			return fmt.Errorf("stable metric %s not found", name)
		}
		// Ignore the Help and Labels fields here; Labels is compared separately below due to ordering issues.
		if diff := cmp.Diff(metric, collectedMetric, cmpopts.IgnoreFields(Metric{}, "Help", "Labels")); diff != "" {
			return fmt.Errorf("stable metric %s mismatch (-want +got):\n%s", name, diff)
		}
		// Compare the Labels field after sorting both sides.
		if diff := cmp.Diff(metric.Labels, collectedMetric.Labels, cmpopts.SortSlices(func(l1, l2 string) bool { return l1 > l2 })); diff != "" {
			return fmt.Errorf("stable metric label %s mismatch (-want +got):\n%s", name, diff)
		}
	}
	for _, name := range sortedKeys(collected) {
		if _, ok = expected[name]; !ok {
			return fmt.Errorf("detected new stable metric %s which isn't in testdata", name)
		}
	}
	return nil
}

func printMetric(metrics []Metric) {
	yamlData, err := yaml.Marshal(metrics)
	if err != nil {
		klog.Errorf("error while Marshaling. %v", err)
	}
	klog.Infoln("---begin YAML file---")
	klog.Infoln(string(yamlData))
	klog.Infoln("---end YAML file---")
}

func parsePromText(path string) (map[string]*prommodel.MetricFamily, error) {
	reader, err := os.Open(filepath.Clean(path))
	if err != nil {
		return nil, err
	}
	// Close the metrics file once it has been parsed.
	defer reader.Close()
	var parser expfmt.TextParser
	mf, err := parser.TextToMetricFamilies(reader)
	if err != nil {
		return nil, err
	}
	return mf, nil
}

func getBuckets(v *prommodel.MetricFamily) []float64 {
	buckets := []float64{}
	if v.GetType() == prommodel.MetricType_HISTOGRAM {
		for _, bucket := range v.Metric[0].GetHistogram().GetBucket() {
			buckets = append(buckets, *bucket.UpperBound)
		}
	} else {
		buckets = nil
	}
	return buckets
}

func getLabels(m *prommodel.Metric) []string {
	labels := []string{}
	for _, y := range m.Label {
		labels = append(labels, y.GetName())
	}
	return labels
}

func extractStableMetrics(mf map[string]*prommodel.MetricFamily) []Metric {
	metrics := []Metric{}
	for _, v := range mf {
		// Find stable metrics
		if !strings.Contains(*(v.Help), "[STABLE]") {
			continue
		}
		metrics = append(metrics, Metric{
			Name:    *(v.Name),
			Help:    *(v.Help),
			Type:    (v.Type).String(),
			Buckets: getBuckets(v),
			Labels:  getLabels(v.Metric[0]),
		})
	}
	return metrics
}

func readYaml(filename string) (*[]Metric, error) {
	buf, err := os.ReadFile(filepath.Clean(filename))
	if err != nil {
		return nil, err
	}
	c := &[]Metric{}
	err = yaml.Unmarshal(buf, c)
	if err != nil {
		return nil, fmt.Errorf("error %q: %w", filename, err)
	}
	return c, nil
}

func convertToMap(metrics []Metric, skipMap map[string]int) map[string]Metric {
	name2Metric := map[string]Metric{}
	for _, v := range metrics {
		if _, ok := skipMap[v.Name]; ok {
			klog.Infof("skip, metric %s is in skip list\n", v.Name)
			continue
		}
		name2Metric[v.Name] = v
	}
	return name2Metric
}

func sortedKeys(m map[string]Metric) []string {
	keys := []string{}
	for name := range m {
		keys = append(keys, name)
	}
	sort.Strings(keys)
	return keys
}
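
For reference, the expected stable metrics live in tests/stable/testdata/stablemetrics.yaml and are unmarshalled into the Metric struct above. The standalone sketch below shows the YAML shape the test expects by marshalling one hypothetical entry; the metric name, help text, and labels are illustrative, not the actual contents of the testdata file.

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// Metric mirrors the struct used by check_test.go.
type Metric struct {
	Name    string    `yaml:"name"`
	Help    string    `yaml:"help"`
	Type    string    `yaml:"type"`
	Labels  []string  `yaml:"labels"`
	Buckets []float64 `yaml:"buckets,omitempty"`
}

func main() {
	// Hypothetical entry, for illustration only.
	entry := []Metric{{
		Name:   "kube_pod_info",
		Help:   "[STABLE] Information about pod.",
		Type:   "GAUGE",
		Labels: []string{"namespace", "pod", "uid", "node"},
	}}
	out, err := yaml.Marshal(entry)
	if err != nil {
		panic(err)
	}
	// Prints a YAML list with name/help/type/labels keys, the layout readYaml expects.
	fmt.Print(string(out))
}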
23 changes: 23 additions & 0 deletions tests/stable/testdata/README.md
@@ -0,0 +1,23 @@
These stable metrics aren't covered in stablemetrics.yaml, because they aren't
exposed in KSM in the test cluster:
- kube_cronjob_spec_starting_deadline_seconds
- kube_cronjob_status_last_schedule_time
- kube_ingress_tls
- kube_job_complete
- kube_job_failed
- kube_job_spec_active_deadline_seconds
- kube_job_status_completion_time
- kube_node_spec_taint
- kube_persistentvolume_claim_ref
- kube_pod_completion_time
- kube_pod_init_container_info
- kube_pod_init_container_status_ready
- kube_pod_init_container_status_restarts_total
- kube_pod_init_container_status_running
- kube_pod_init_container_status_terminated
- kube_pod_init_container_status_waiting
- kube_pod_spec_volumes_persistentvolumeclaims_info
- kube_pod_spec_volumes_persistentvolumeclaims_readonly
- kube_pod_status_unschedulable
- kube_service_spec_external_ip
- kube_service_status_load_balancer_ingress

Review discussion on this file:

Member:

Can you add more documentation on how to run this stuff?

@dgrisonnet (Member), Jan 10, 2023:

This is a bit problematic since ideally we would want a system that verifies that none of the stable metrics change.

Another problem we might encounter is that some labels will not be verified because they are not present in the e2e tests. I don't have any example of stable metrics like that in mind, but we have some for experimental ones: https://github.com/kubernetes/kube-state-metrics/blob/main/internal/store/endpointslice.go#L77-L148. If we were to verify that metric today in the e2e tests, we would have to make sure that the endpointslice object is complete, which is very prone to errors.

I recall we discussed implementing a static analysis solution to make sure that we were going over all the metrics and labels; was that not possible in the end?

@CatherineF-dev (Contributor Author), Jan 11, 2023:

> a static analysis solution

It's hard to use static analysis for the current kube-state-metrics repo, because labels are too dynamic. For example:

  1. It uses labelKeys = append(labelKeys, "ready") in kube_endpointslice_endpoints.
  2. Labels come from another function.

One possible way to make static analysis easier is to require defining all possible labels first:

const labels []string = {"label1", "label2", ...} // const. So that it can't be appended later.

Member:

  1. There are two things that are difficult to handle in that example:
     1.1 Having an intermediate labelKeys variable means that we can't simply have the tool look for the labelKeys field assignation; we might also need to build a constant version of the variable assigned to the field. An idea could be to have the tool first look for the labelKeys field assignation and, in case the RValue is a variable and not a constant, iterate over the code again to build a const version of that variable.
     1.2 append should be the only CallExpr that we will ever see, so having a case in the tool just for it would make sense to me. The idea would be to replace append(labelKeys, "ready") by building a temporary slice and putting the "ready" string inside of it, converting the variable array to a constant one for our static analysis.
  2. We won't have to handle this scenario because it only affects _labels and _annotations metrics, which have variable labels by design and as such shouldn't have any stability guarantees.

> One possible way to make static analysis easier is to require defining all possible labels first.

This could make sense, but I am afraid that this might make label value assignation error-prone, since we would have to use indexes instead of just appending the new labels.

The only option I see today in order to catch all the labels of all the metrics is via static analysis, and if we don't want to make it intrusive for developers (to avoid errors), we will need to make the tool intelligent.

For the static analysis I am thinking that the following could be viable:

  1. Create an AST of each store and iterate over every generator's code.
  2. Find the labelKeys field assignation.
  3. If the RValue isn't constant, reiterate over the generators to build a constant version of the variable. That should be possible because even though the slices are dynamic, their values are all constant.

Writing this, I noticed that there is another scenario we would need to handle, which is the one of default labels:

descEndpointSliceLabelsDefaultLabels = []string{"endpointslice"}

These should always be defined at the top level of each store as a constant slice, so we should be able to easily add them to each metric.

This is becoming quite complex, so if you have any other suggestions I would gladly hear them, but I don't think that we can avoid static analysis if we want to build consistent guarantees.

Contributor Author:

> I don't think that we can avoid static analysis if we want to build consistent guarantees.

Agree.

> This could make sense but I am afraid that this might make label values assignation error-prone since we would have to use the indexes instead of just appending the new labels.

Could you provide an example for this case?

Member:

> Could you provide an example for this case?

I think my reasoning was based on examples such as https://github.com/kubernetes/kube-state-metrics/blob/main/internal/store/endpointslice.go#L77-L148. In these cases order is super important since we build the list of label values one label at a time. I thought that if we were to define label keys upfront, future addition of labels might mess with the existing ordering.

But thinking about it again, we could transform the code into something like:

const labelKeys []string = {"ready", "serving", ...}

ready := ""
if ep.Conditions.Ready != nil {
	ready = ep.Conditions.Ready
}

serving := ""
if ep.Conditions.Serving != nil {
	serving = ep.Conditions.Serving
}

m = append(m, &metric.Metric{
	LabelKeys:   labelKeys,
	LabelValues: []string{ready, serving},
	Value:       1,
})

The original reasoning for how it is coded in the example I shared above was to reduce the network footprint by removing empty labels. But it could be argued that this doesn't have much impact, plus we could do that processing later on when we build the HTTP response.

So I think your idea of defining const label keys everywhere makes sense and would ease static analysis very much.
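
As a rough illustration of the AST-based approach discussed above (not part of this change), a tool could walk a store file and collect the string literals assigned or appended to a labelKeys variable. The inlined source snippet, function name, and the assumption that label keys always appear as string literals are for illustration only:

package main

import (
	"fmt"
	"go/ast"
	"go/parser"
	"go/token"
	"strconv"
)

// Inlined stand-in for a store file; real stores live under internal/store/.
const src = `
package store

func endpointSliceMetricFamilies() {
	labelKeys := []string{"namespace", "endpointslice"}
	labelKeys = append(labelKeys, "ready")
	_ = labelKeys
}
`

// stringLits pulls string literals out of an expression that is either a
// []string composite literal or an append(labelKeys, ...) call.
func stringLits(expr ast.Expr) []string {
	var out []string
	switch e := expr.(type) {
	case *ast.CompositeLit:
		for _, elt := range e.Elts {
			if lit, ok := elt.(*ast.BasicLit); ok && lit.Kind == token.STRING {
				if s, err := strconv.Unquote(lit.Value); err == nil {
					out = append(out, s)
				}
			}
		}
	case *ast.CallExpr:
		if fn, ok := e.Fun.(*ast.Ident); ok && fn.Name == "append" && len(e.Args) > 1 {
			for _, arg := range e.Args[1:] {
				if lit, ok := arg.(*ast.BasicLit); ok && lit.Kind == token.STRING {
					if s, err := strconv.Unquote(lit.Value); err == nil {
						out = append(out, s)
					}
				}
			}
		}
	}
	return out
}

func main() {
	fset := token.NewFileSet()
	f, err := parser.ParseFile(fset, "store.go", src, 0)
	if err != nil {
		panic(err)
	}
	labels := []string{}
	ast.Inspect(f, func(n ast.Node) bool {
		// Inspect every assignment whose left-hand side is "labelKeys".
		assign, ok := n.(*ast.AssignStmt)
		if !ok {
			return true
		}
		for i, lhs := range assign.Lhs {
			if ident, ok := lhs.(*ast.Ident); ok && ident.Name == "labelKeys" && i < len(assign.Rhs) {
				labels = append(labels, stringLits(assign.Rhs[i])...)
			}
		}
		return true
	})
	fmt.Println(labels) // [namespace endpointslice ready]
}

A real tool would also need to fold in each store's default labels slice (e.g. descEndpointSliceLabelsDefaultLabels mentioned above) and resolve label keys that come from helper functions.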