Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

introduce WatchListLatencyPrometheus measurement #2315

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
interval: 1m
input_series:
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="1"}
values: 0 0 0 0 0 0 1 991 991 991 991
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="5"}
values: 0 0 0 0 0 0 1 1001 1001 1001 1001
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="+Inf"}
values: 0 0 0 0 0 0 1 1001 1001 1001 1001

- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="1"}
values: 0 0 0 0 0 0 1 991 991 991 991
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="5"}
values: 0 0 0 0 0 0 1 1001 1001 1001 1001
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="+Inf"}
values: 0 0 0 0 0 0 1 1001 1001 1001 1001

- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="1"}
values: 0 0 0 0 0 0 1 991 991 991 991
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="5"}
values: 0 0 0 0 0 0 1 1001 1001 1001 1001
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="+Inf"}
values: 0 0 0 0 0 0 1 1001 1001 1001 1001

- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="1"}
values: 0 0 0 0 0 0 1 701 701 701 701 701
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="1.5"}
values: 0 0 0 0 0 0 1 1001 1001 1001 1001
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="+Inf"}
values: 0 0 0 0 0 0 1 1001 1001 1001 1001

- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="5"}
values: 0 0 0 0 0 0 1 901 901 901 901 901
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="10"}
values: 0 0 0 0 0 0 1 1001 1001 1001 1001
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="+Inf"}
values: 0 0 0 0 0 0 1 1001 1001 1001 1001
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"version": "v1",
"dataItems": [
{
"data": {
"Perc50": 713.980028,
"Perc90": 1333.166666,
"Perc99": 1483.316666
},
"unit": "ms",
"labels": {
"Group": "storage.k8s.io",
"Resource": "pod",
"Scope": "namespace",
"Version": "v1"
}
}
]
}
173 changes: 173 additions & 0 deletions clusterloader2/pkg/measurement/common/watch_list_latency_prometheus.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package common

import (
"fmt"
"sort"
"strconv"
"time"

"github.com/prometheus/common/model"

"k8s.io/klog/v2"
"k8s.io/perf-tests/clusterloader2/pkg/measurement"
measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
"k8s.io/perf-tests/clusterloader2/pkg/util"
)

const (
watchListLatencyPrometheusMeasurementName = "WatchListLatencyPrometheus"

// watchListLatencyQuery placeholders must be replaced with (1) quantile (2) query window size
watchListLatencyQuery = "histogram_quantile(%.2f, sum(rate(apiserver_watch_list_duration_seconds{}[%v])) by (group, version, resource, scope, le))"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For consistency with https://github.com/kubernetes/perf-tests/blob/master/clusterloader2/pkg/measurement/common/slos/api_responsiveness_prometheus.go

can you suffix it with Simple?

basically this is simplified version of the slo and we should reflect it.

)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we need "_bucket" at the end of metric name?
We're using it here:
https://github.com/kubernetes/perf-tests/blob/master/clusterloader2/pkg/measurement/common/slos/api_responsiveness_prometheus.go#L58

I would really prefer consistency between those two.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually - I started to wonder more generically - why can't we just reuse that other measurement that I linked?

I think we're effectively reimplementing the exact same logic and the only differences that we have are:
(1) we're using a different metric name
(2) the verb is always LIST

I think it should be possible to slightly refactor that other measurement and simply register two measurements there:
https://github.com/kubernetes/perf-tests/blob/master/clusterloader2/pkg/measurement/common/slos/api_responsiveness_prometheus.go#L81C3-L82C1

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ApiResponsivenessGatherer differs in a few places.

First of all, it has two different modes for getting the latency metrics (simple and extended).
In addition to gathering the latency, it collects two additional metrics: count and countFast.
The internal data structures hold all three metrics.
Once the metrics are collected, it supports reading a custom threshold from the config, which is used for further validation.

I think that the refactoring would boil down to creating "generic simple latency metrics," which could potentially be reused by both implementations.

Given that the internal data structures differ, the existing implementation would have to incorporate the generic latency metric and extend it.

Is this what you had in mind ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we need "_bucket" at the end of metric name?

Yes you should use the buckets with histogram_quantile.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@p0lyn0mial - I'm a bit lost in your comment, so let me try to explain a bit deeper what I had in mind:

  1. yes, there are two modes (simple and "normal", but the difference between these two is only how we're sampling the metrics. To be more specific, this is the only difference between these two modes:
    https://github.com/kubernetes/perf-tests/blob/master/clusterloader2/pkg/measurement/common/slos/api_responsiveness_prometheus.go#L176-L204

  2. From e2e user perspective, if I want to list my objects, it doesn't really matter if the server underneath is using the list method or the new watchlist protocol. I care about the latency of getting the result

  3. Because of (2), we generally don't want to introduce a separate measurement, it should actually be part of exactly the same measurement (same config, same threshold, ....)
    Although, initially we may want to split that a bit for debuggability reasons.

  4. So the way I think about what we should do is effectively:

Once we prove that, we should actually merge the Samples for list & watchlist together, but let's do that as a follow-up and just start by treating them as separate things.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I think I understand, the new metric (watchlist) will end up on being reported as part of LoadResponsiveness_PrometheusSimple for all jobs! I like it. Thanks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

created #2764


// init registers the WatchListLatencyPrometheus measurement with the global
// measurement registry. Registration failure is fatal at process startup.
func init() {
	factory := func() measurement.Measurement {
		return CreatePrometheusMeasurement(&watchListLatencyGatherer{})
	}
	if err := measurement.Register(watchListLatencyPrometheusMeasurementName, factory); err != nil {
		klog.Fatalf("Cannot register %s: %v", watchListLatencyPrometheusMeasurementName, err)
	}
}

type watchListLatencyMetric struct {
wojtek-t marked this conversation as resolved.
Show resolved Hide resolved
Group string `json:"group"`
Version string `json:"version"`
Resource string `json:"resource"`
Scope string `json:"scope"`
Latency measurementutil.LatencyMetric `json:"latency"`
}

type watchListLatencyMetrics map[string]*watchListLatencyMetric

// SetLatency records latency under the given quantile for the
// (group, version, resource, scope) combination, creating the entry on first
// use. The map key includes every identifying label so that, for example, two
// versions of the same resource do not overwrite each other's quantiles.
func (m watchListLatencyMetrics) SetLatency(group, version, resource, scope string, quantile float64, latency time.Duration) {
	// The original key dropped version, causing collisions between versions.
	key := fmt.Sprintf("%s|%s|%s|%s", group, version, resource, scope)
	entry, exists := m[key]
	if !exists {
		entry = &watchListLatencyMetric{
			Group:    group,
			Version:  version,
			Resource: resource,
			Scope:    scope,
		}
		m[key] = entry
	}
	entry.Latency.SetQuantile(quantile, latency)
}

// watchListLatencyGatherer gathers the 50th, 90th and 99th duration quantiles
// for watch list requests, broken down by group, version, resource and scope.
// It is stateless; all work happens in Gather.
type watchListLatencyGatherer struct{}

// Gather runs the watch-list latency quantile queries against Prometheus,
// converts the resulting samples into per-resource latency metrics and
// returns them as a single JSON summary. The summary name defaults to the
// measurement name and can be overridden via the "summaryName" config param.
func (m *watchListLatencyGatherer) Gather(executor QueryExecutor, startTime, endTime time.Time, config *measurement.Config) ([]measurement.Summary, error) {
	samples, err := gatherWatchListLatencyPrometheusSamples(executor, startTime, endTime)
	if err != nil {
		return nil, err
	}
	metrics, err := convertWatchListPrometheusSamplesToWatchListLatencyMetrics(samples)
	if err != nil {
		return nil, err
	}
	content, err := util.PrettyPrintJSON(convertWatchListLatencyMetricsToPerfData(metrics))
	if err != nil {
		return nil, err
	}
	name, err := util.GetStringOrDefault(config.Params, "summaryName", m.String())
	if err != nil {
		return nil, err
	}
	return []measurement.Summary{measurement.CreateSummary(name, "json", content)}, nil
}

// Configure is a no-op; the gatherer needs no configuration.
func (m *watchListLatencyGatherer) Configure(_ *measurement.Config) error {
	return nil
}

// IsEnabled reports whether the measurement should run; it is always enabled.
func (m *watchListLatencyGatherer) IsEnabled(_ *measurement.Config) bool {
	return true
}

// String returns the registered measurement name.
func (m *watchListLatencyGatherer) String() string {
	return watchListLatencyPrometheusMeasurementName
}

// gatherWatchListLatencyPrometheusSamples queries Prometheus at endTime for
// the 0.50, 0.90 and 0.99 watch-list latency quantiles over the whole
// [startTime, endTime] window and returns the combined samples. Each sample
// is tagged with a "quantile" label so its originating quantile can be
// recovered later.
func gatherWatchListLatencyPrometheusSamples(executor QueryExecutor, startTime, endTime time.Time) ([]*model.Sample, error) {
	// Only these quantiles are supported, because the results are stored in a
	// LatencyMetric (Perc50/Perc90/Perc99).
	quantiles := []float64{0.5, 0.9, 0.99}
	window := measurementutil.ToPrometheusTime(endTime.Sub(startTime))

	var collected []*model.Sample
	for _, quantile := range quantiles {
		samples, err := executor.Query(fmt.Sprintf(watchListLatencyQuery, quantile, window), endTime)
		if err != nil {
			return nil, err
		}
		for _, s := range samples {
			s.Metric["quantile"] = model.LabelValue(fmt.Sprintf("%.2f", quantile))
		}
		collected = append(collected, samples...)
	}
	return collected, nil
}

// convertWatchListPrometheusSamplesToWatchListLatencyMetrics groups the raw
// quantile samples by (group, version, resource, scope) and stores each
// sample's value as the latency for the quantile encoded in its "quantile"
// label. It fails if a "quantile" label does not parse as a float.
func convertWatchListPrometheusSamplesToWatchListLatencyMetrics(latencySamples []*model.Sample) (watchListLatencyMetrics, error) {
	result := make(watchListLatencyMetrics)
	for _, sample := range latencySamples {
		quantile, err := strconv.ParseFloat(string(sample.Metric["quantile"]), 64)
		if err != nil {
			return nil, err
		}
		result.SetLatency(
			string(sample.Metric["group"]),
			string(sample.Metric["version"]),
			string(sample.Metric["resource"]),
			string(sample.Metric["scope"]),
			quantile,
			// Sample values are expressed in seconds.
			time.Duration(float64(sample.Value)*float64(time.Second)),
		)
	}
	return result, nil
}

// convertWatchListLatencyMetricsToPerfData flattens the collected metrics
// into a PerfData summary, ordered by 99th-percentile latency (slowest first)
// so the worst offenders appear at the top. Latencies are reported in
// fractional milliseconds.
func convertWatchListLatencyMetricsToPerfData(watchListMetrics watchListLatencyMetrics) *measurementutil.PerfData {
	metrics := make([]*watchListLatencyMetric, 0, len(watchListMetrics))
	for _, m := range watchListMetrics {
		metrics = append(metrics, m)
	}
	sort.Slice(metrics, func(i, j int) bool {
		return metrics[i].Latency.Perc99 > metrics[j].Latency.Perc99
	})

	// toMilliseconds converts a time.Duration (nanoseconds) to fractional
	// milliseconds; clearer than the former magic constant 1000000.
	toMilliseconds := func(d time.Duration) float64 {
		return float64(d) / float64(time.Millisecond)
	}

	perfData := &measurementutil.PerfData{Version: "v1"}
	for _, m := range metrics {
		perfData.DataItems = append(perfData.DataItems, measurementutil.DataItem{
			Data: map[string]float64{
				"Perc50": toMilliseconds(m.Latency.Perc50),
				"Perc90": toMilliseconds(m.Latency.Perc90),
				"Perc99": toMilliseconds(m.Latency.Perc99),
			},
			Unit: "ms",
			Labels: map[string]string{
				"Group":    m.Group,
				"Version":  m.Version,
				"Resource": m.Resource,
				"Scope":    m.Scope,
			},
		})
	}
	return perfData
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package common

import (
"fmt"
"os"
"testing"
"time"

"github.com/google/go-cmp/cmp"

"k8s.io/perf-tests/clusterloader2/pkg/measurement"
"k8s.io/perf-tests/clusterloader2/pkg/measurement/common/executors"
)

// TestWatchListLatencyGather runs the gatherer against canned PromQL input
// and compares the produced JSON summary with a checked-in golden file.
func TestWatchListLatencyGather(t *testing.T) {
	scenarios := []struct {
		name          string
		inputFileName string
		duration      time.Duration
	}{
		{
			name:          "smoke test: make sure the output matches the static golden file",
			inputFileName: "sample.yaml",
			duration:      10 * time.Minute,
		},
	}

	for _, scenario := range scenarios {
		t.Run(scenario.name, func(t *testing.T) {
			inputFilePath := fmt.Sprintf("testdata/watch_list_latency_prometheus/%s", scenario.inputFileName)
			executor, err := executors.NewPromqlExecutor(inputFilePath)
			if err != nil {
				t.Fatalf("failed to create PromQL executor: %v", err)
			}
			defer executor.Close()

			gatherer := &watchListLatencyGatherer{}
			startTime := time.Unix(0, 0).UTC()
			endTime := startTime.Add(scenario.duration)
			summaries, err := gatherer.Gather(executor, startTime, endTime, &measurement.Config{Params: map[string]interface{}{}})
			if err != nil {
				t.Fatal(err)
			}
			if len(summaries) != 1 {
				t.Fatalf("expected only one summary, got: %d", len(summaries))
			}

			wantRaw, err := os.ReadFile(inputFilePath + ".golden")
			if err != nil {
				t.Fatalf("unable to read the golden file, err: %v", err)
			}
			if diff := cmp.Diff(string(wantRaw), summaries[0].SummaryContent()); diff != "" {
				t.Errorf("unexpected output (-want +got):\n%s", diff)
			}
			// To regenerate the golden file for a failing case, temporarily
			// uncomment the following line:
			//
			// os.WriteFile(inputFilePath+".golden", []byte(summaries[0].SummaryContent()), 0644)
		})
	}
}