introduce WatchListLatencyPrometheus measurement #2315
Changes from all commits
@@ -0,0 +1,36 @@ (new test fixture: testdata/watch_list_latency_prometheus/sample.yaml)
interval: 1m
input_series:
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="1"}
  values: 0 0 0 0 0 0 1 991 991 991 991
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="5"}
  values: 0 0 0 0 0 0 1 1001 1001 1001 1001
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="+Inf"}
  values: 0 0 0 0 0 0 1 1001 1001 1001 1001

- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="1"}
  values: 0 0 0 0 0 0 1 991 991 991 991
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="5"}
  values: 0 0 0 0 0 0 1 1001 1001 1001 1001
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="+Inf"}
  values: 0 0 0 0 0 0 1 1001 1001 1001 1001

- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="1"}
  values: 0 0 0 0 0 0 1 991 991 991 991
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="5"}
  values: 0 0 0 0 0 0 1 1001 1001 1001 1001
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="+Inf"}
  values: 0 0 0 0 0 0 1 1001 1001 1001 1001

- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="1"}
  values: 0 0 0 0 0 0 1 701 701 701 701 701
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="1.5"}
  values: 0 0 0 0 0 0 1 1001 1001 1001 1001
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="+Inf"}
  values: 0 0 0 0 0 0 1 1001 1001 1001 1001

- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="5"}
  values: 0 0 0 0 0 0 1 901 901 901 901 901
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="10"}
  values: 0 0 0 0 0 0 1 1001 1001 1001 1001
- series: apiserver_watch_list_duration_seconds{component="apiserver",group="storage.k8s.io",resource="pod",scope="namespace",version="v1",le="+Inf"}
  values: 0 0 0 0 0 0 1 1001 1001 1001 1001
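For context, the measurement's latency query (defined later in this diff) is evaluated against series like these. With the roughly ten-minute window used by the unit test at the end of this diff, the 0.99-quantile instance of that query would expand to something like the line below; the concrete "10m" window string is an assumption derived from that test's duration, not text from this PR:

histogram_quantile(0.99, sum(rate(apiserver_watch_list_duration_seconds{}[10m])) by (group, version, resource, scope, le))

The 0.5 and 0.9 quantile variants differ only in the first argument.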
@@ -0,0 +1,19 @@ (expected golden output: testdata/watch_list_latency_prometheus/sample.yaml.golden)
{
  "version": "v1",
  "dataItems": [
    {
      "data": {
        "Perc50": 713.980028,
        "Perc90": 1333.166666,
        "Perc99": 1483.316666
      },
      "unit": "ms",
      "labels": {
        "Group": "storage.k8s.io",
        "Resource": "pod",
        "Scope": "namespace",
        "Version": "v1"
      }
    }
  ]
}
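As a sanity check, these percentiles are consistent with standard histogram_quantile linear interpolation over the fixture block whose cumulative bucket counts end at 701 (le="1"), 1001 (le="1.5") and 1001 (le="+Inf"); that this is the block the executor ends up serving is an assumption, but the arithmetic reproduces the golden values exactly:

Perc50: 0.50 × 1001 = 500.5 ≤ 701, so within (0, 1]: 500.5 / 701 × 1s ≈ 0.713980 s ≈ 713.980028 ms
Perc90: 0.90 × 1001 = 900.9, so within (1, 1.5]: 1s + (900.9 − 701) / 300 × 0.5s ≈ 1.333167 s ≈ 1333.166666 ms
Perc99: 0.99 × 1001 = 990.99, so within (1, 1.5]: 1s + (990.99 − 701) / 300 × 0.5s ≈ 1.483317 s ≈ 1483.316666 ms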
@@ -0,0 +1,173 @@ (new Go source: the WatchListLatencyPrometheus measurement, package common)
/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package common

import (
	"fmt"
	"sort"
	"strconv"
	"time"

	"github.com/prometheus/common/model"

	"k8s.io/klog/v2"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
	"k8s.io/perf-tests/clusterloader2/pkg/util"
)

const (
	watchListLatencyPrometheusMeasurementName = "WatchListLatencyPrometheus"

	// watchListLatencyQuery placeholders must be replaced with (1) quantile and (2) query window size
	watchListLatencyQuery = "histogram_quantile(%.2f, sum(rate(apiserver_watch_list_duration_seconds{}[%v])) by (group, version, resource, scope, le))"
)
Inline review thread on this query:

Don't we need "_bucket" at the end of the metric name? I would really prefer consistency between those two.

Actually, I started to wonder more generically: why can't we just reuse that other measurement that I linked? I think we're effectively reimplementing the exact same logic, and the only differences that we have are: […] I think it should be possible to slightly refactor that other measurement and simply register two measurements there: […]

The […]. First of all, it has two different modes for getting the latency metrics (simple and extended). I think that the refactoring would boil down to creating "generic simple latency metrics", which could potentially be reused by both implementations. Given that the internal data structures differ, the existing implementation would have to incorporate the generic latency metric and extend it. Is this what you had in mind?

Yes, you should use the buckets with […]

@p0lyn0mial - I'm a bit lost in your comment, so let me try to explain a bit deeper what I had in mind: […] Once we prove that, we should actually merge the Samples for list & watchlist together, but let's do that as a follow-up and just start by treating them as separate things.

OK, I think I understand, the new metric (watchlist) will end up being reported as part of […]

created #2764
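Since histogram_quantile normally operates on the explicit *_bucket series, a minimal sketch of the variant the first comment asks about could look like the constant below. This is a hypothetical illustration, not code from this PR; whether the query or the test fixture is the side that ultimately changes is exactly the consistency question raised in the thread above.

// Hypothetical variant of watchListLatencyQuery using the explicit histogram
// bucket series; placeholders are still (1) quantile and (2) query window size.
watchListLatencyQuery = "histogram_quantile(%.2f, sum(rate(apiserver_watch_list_duration_seconds_bucket{}[%v])) by (group, version, resource, scope, le))"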
func init() {
	create := func() measurement.Measurement { return CreatePrometheusMeasurement(&watchListLatencyGatherer{}) }
	if err := measurement.Register(watchListLatencyPrometheusMeasurementName, create); err != nil {
		klog.Fatalf("Cannot register %s: %v", watchListLatencyPrometheusMeasurementName, err)
	}
}

type watchListLatencyMetric struct {
	Group    string                        `json:"group"`
	Version  string                        `json:"version"`
	Resource string                        `json:"resource"`
	Scope    string                        `json:"scope"`
	Latency  measurementutil.LatencyMetric `json:"latency"`
}

type watchListLatencyMetrics map[string]*watchListLatencyMetric

func (m watchListLatencyMetrics) SetLatency(group, version, resource, scope string, quantile float64, latency time.Duration) {
	key := fmt.Sprintf("%s|%s|%s", group, resource, scope)
	entry, exists := m[key]
	if !exists {
		entry = &watchListLatencyMetric{
			Group:    group,
			Version:  version,
			Resource: resource,
			Scope:    scope,
		}
		m[key] = entry
	}
	entry.Latency.SetQuantile(quantile, latency)
}

// watchListLatencyGatherer gathers 50th, 90th and 99th duration quantiles
// for watch list requests broken down by group, resource, scope.
type watchListLatencyGatherer struct{}

func (m *watchListLatencyGatherer) Gather(executor QueryExecutor, startTime, endTime time.Time, config *measurement.Config) ([]measurement.Summary, error) {
	rawWatchListMetrics, err := gatherWatchListLatencyPrometheusSamples(executor, startTime, endTime)
	if err != nil {
		return nil, err
	}
	watchListMetrics, err := convertWatchListPrometheusSamplesToWatchListLatencyMetrics(rawWatchListMetrics)
	if err != nil {
		return nil, err
	}
	watchListMetricsJSON, err := util.PrettyPrintJSON(convertWatchListLatencyMetricsToPerfData(watchListMetrics))
	if err != nil {
		return nil, err
	}
	summaryName, err := util.GetStringOrDefault(config.Params, "summaryName", m.String())
	if err != nil {
		return nil, err
	}
	summaries := []measurement.Summary{
		measurement.CreateSummary(summaryName, "json", watchListMetricsJSON),
	}
	return summaries, nil
}

func (m *watchListLatencyGatherer) Configure(_ *measurement.Config) error { return nil }
func (m *watchListLatencyGatherer) IsEnabled(_ *measurement.Config) bool  { return true }
func (m *watchListLatencyGatherer) String() string                        { return watchListLatencyPrometheusMeasurementName }

func gatherWatchListLatencyPrometheusSamples(executor QueryExecutor, startTime, endTime time.Time) ([]*model.Sample, error) {
	var latencySamples []*model.Sample
	// since we collect LatencyMetric only 0.5, 0.9 and 0.99 quantiles are supported
	quantiles := []float64{0.5, 0.9, 0.99}
	measurementDuration := endTime.Sub(startTime)
	promDuration := measurementutil.ToPrometheusTime(measurementDuration)

	for _, q := range quantiles {
		query := fmt.Sprintf(watchListLatencyQuery, q, promDuration)
		samples, err := executor.Query(query, endTime)
		if err != nil {
			return nil, err
		}
		for _, sample := range samples {
			sample.Metric["quantile"] = model.LabelValue(fmt.Sprintf("%.2f", q))
		}
		latencySamples = append(latencySamples, samples...)
	}

	return latencySamples, nil
}

func convertWatchListPrometheusSamplesToWatchListLatencyMetrics(latencySamples []*model.Sample) (watchListLatencyMetrics, error) {
	latencyMetrics := make(watchListLatencyMetrics)
	extractLabels := func(sample *model.Sample) (string, string, string, string) {
		return string(sample.Metric["group"]), string(sample.Metric["version"]), string(sample.Metric["resource"]), string(sample.Metric["scope"])
	}

	for _, sample := range latencySamples {
		group, version, resource, scope := extractLabels(sample)
		quantile, err := strconv.ParseFloat(string(sample.Metric["quantile"]), 64)
		if err != nil {
			return nil, err
		}

		latency := time.Duration(float64(sample.Value) * float64(time.Second))
		latencyMetrics.SetLatency(group, version, resource, scope, quantile, latency)
	}

	return latencyMetrics, nil
}

func convertWatchListLatencyMetricsToPerfData(watchListMetrics watchListLatencyMetrics) *measurementutil.PerfData {
	var watchListMetricsSlice []*watchListLatencyMetric
	for _, v := range watchListMetrics {
		watchListMetricsSlice = append(watchListMetricsSlice, v)
	}
	sort.Slice(watchListMetricsSlice, func(i, j int) bool {
		return watchListMetricsSlice[i].Latency.Perc99 > watchListMetricsSlice[j].Latency.Perc99
	})

	perfData := &measurementutil.PerfData{Version: "v1"}
	for _, watchListMetric := range watchListMetricsSlice {
		item := measurementutil.DataItem{
			Data: map[string]float64{
				"Perc50": float64(watchListMetric.Latency.Perc50) / 1000000,
				"Perc90": float64(watchListMetric.Latency.Perc90) / 1000000,
				"Perc99": float64(watchListMetric.Latency.Perc99) / 1000000,
			},
			Unit: "ms",
			Labels: map[string]string{
				"Group":    watchListMetric.Group,
				"Version":  watchListMetric.Version,
				"Resource": watchListMetric.Resource,
				"Scope":    watchListMetric.Scope,
			},
		}
		perfData.DataItems = append(perfData.DataItems, item)
	}
	return perfData
}
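As a side note, taking QueryExecutor as a parameter is what makes the gatherer easy to unit-test. A minimal fake is sketched below, assuming (based only on the usage in Gather above) that the interface requires a Query(query string, t time.Time) ([]*model.Sample, error) method; this is a hypothetical test double, and the test in this PR uses the promql-file-based executor instead.

// fakeQueryExecutor is a hypothetical test double that returns canned samples
// for every query; it relies on the model import shown above.
type fakeQueryExecutor struct {
	samples []*model.Sample
}

func (f *fakeQueryExecutor) Query(query string, queryTime time.Time) ([]*model.Sample, error) {
	return f.samples, nil
}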
@@ -0,0 +1,79 @@ (new unit test exercising the gatherer against the fixture and golden file above)
/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package common

import (
	"fmt"
	"os"
	"testing"
	"time"

	"github.com/google/go-cmp/cmp"

	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement/common/executors"
)

func TestWatchListLatencyGather(t *testing.T) {
	scenarios := []struct {
		name          string
		inputFileName string

		duration time.Duration
	}{
		{
			name:          "smoke test: make sure the output matches the static golden file",
			inputFileName: "sample.yaml",
			duration:      10 * time.Minute,
		},
	}

	for _, scenario := range scenarios {
		t.Run(scenario.name, func(t *testing.T) {
			inputFilePath := fmt.Sprintf("testdata/watch_list_latency_prometheus/%s", scenario.inputFileName)
			executor, err := executors.NewPromqlExecutor(inputFilePath)
			if err != nil {
				t.Fatalf("failed to create PromQL executor: %v", err)
			}
			defer executor.Close()

			emptyConfig := &measurement.Config{Params: map[string]interface{}{}}
			target := &watchListLatencyGatherer{}
			start := time.Unix(0, 0).UTC()
			end := start.Add(scenario.duration)
			output, err := target.Gather(executor, start, end, emptyConfig)
			if err != nil {
				t.Fatal(err)
			}
			if len(output) != 1 {
				t.Fatalf("expected only one summary, got: %d", len(output))
			}

			rawGoldenFile, err := os.ReadFile(inputFilePath + ".golden")
			if err != nil {
				t.Fatalf("unable to read the golden file, err: %v", err)
			}
			if diff := cmp.Diff(string(rawGoldenFile), output[0].SummaryContent()); diff != "" {
				t.Errorf("unexpected output (-want +got):\n%s", diff)
			}
			// for simplicity, you can uncomment the following line to
			// generate a new golden file for a failed test case.
			//
			// os.WriteFile(inputFilePath+".golden", []byte(output[0].SummaryContent()), 0644)
		})
	}
}
An overall review comment on the new measurement:

For consistency with https://github.com/kubernetes/perf-tests/blob/master/clusterloader2/pkg/measurement/common/slos/api_responsiveness_prometheus.go, can you suffix it with Simple? Basically, this is a simplified version of the SLO measurement and we should reflect that.
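A minimal sketch of what that suggestion could mean in practice, assuming it targets the registered measurement name; the exact suffixed name and the scope of the rename are assumptions, not something this PR contains:

// Hypothetical rename following the review suggestion above; not part of this PR.
const watchListLatencyPrometheusMeasurementName = "WatchListLatencyPrometheusSimple"

Whatever string is passed to measurement.Register is also the name that test configurations would use to refer to the measurement, so the rename would be visible to users of the measurement as well.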