Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: use constrained topk to improve dashboard performance #2825

Merged
merged 2 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 59 additions & 18 deletions cmd/tools/grafana/dashboard_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -807,12 +807,12 @@ func checkUniquePanelIDs(t *testing.T, path string, data []byte) {
})
}

// - collect all expressions that include "topk". Ignore expressions that are:
// - Collect all expressions and variables that include "topk".
// Ignore expressions that are:
// - part of a table or stat or
// - calculate a percentage
// - for each expression - check if any variable used in the expression has a topk range
// a) if it does, pass
// b) otherwise fail, printing the expression, path, dashboard
// - if the var|expression includes a `rate|deriv`, ensure the look-back is 4m
// - otherwise, the look-back should be 3h

func TestTopKRange(t *testing.T) {
VisitDashboards(
Expand Down Expand Up @@ -846,24 +846,27 @@ func checkTopKRange(t *testing.T, path string, data []byte) {
if strings.Contains(expr.expr, "/") {
continue
}
hasRange := false
vars:

for _, name := range expr.vars {
for _, v := range variables {
if v.name == name && strings.Contains(v.query, "__range") {
hasRange = true
break vars
}
v, ok := variables[name]
if !ok {
t.Errorf(`dashboard=%s path=%s is using var that does not exist. var=%s`,
ShortPath(path), expr.path, name)
continue
}
if !strings.Contains(v.query, "topk") {
continue
}
}

noWhitespace := strings.ReplaceAll(expr.expr, " ", "")
if strings.Contains(noWhitespace, "[$__range]@end()") {
hasRange = true
problem := ensureLookBack(v.query)
if problem != "" {
t.Errorf(`dashboard=%s var=%s topk got=%s %s`, ShortPath(path), v.name, v.query, problem)
}
}
if !hasRange {
t.Errorf(`dashboard=%s path=%s use topk but no variable has range. expr=%s`,
ShortPath(path), expr.path, expr.expr)

problem := ensureLookBack(expr.expr)
if problem != "" {
t.Errorf(`dashboard=%s path=%s topk got=%s %s`, ShortPath(path), expr.path, expr.expr, problem)
}
}

Expand Down Expand Up @@ -907,6 +910,44 @@ func checkTopKRange(t *testing.T, path string, data []byte) {

}

// lookBackRe matches every bracketed range selector in a query, e.g. `[3h]`
// or `[$__range]`, and captures the text between the brackets.
var lookBackRe = regexp.MustCompile(`\[(.*?)]`)

// ensureLookBack checks every look-back (bracketed range) found in text.
//
// For each look-back, the enclosing function is taken to be the text between
// the nearest preceding space and the nearest preceding "(". If that text
// contains "rate" or "deriv", the look-back must be 4m; otherwise it must
// be 3h. A look-back with no preceding "(" has no enclosing function and is
// therefore held to the 3h rule.
//
// It returns an empty string when every look-back is acceptable (or when text
// has no look-back at all); otherwise it returns a short message describing
// the look-back wanted for the first offending range.
func ensureLookBack(text string) string {
	// A single pass yields both the captured look-back and its position.
	// Each entry is [matchStart, matchEnd, groupStart, groupEnd].
	for _, loc := range lookBackRe.FindAllStringSubmatchIndex(text, -1) {
		lookBack := text[loc[2]:loc[3]]
		prefix := text[:loc[1]]

		// Search backwards for the function that wraps this look-back.
		// Guard against a missing "(" so that a bare selector such as
		// `metric[5m]` is reported instead of panicking on text[:-1].
		function := ""
		if open := strings.LastIndex(prefix, "("); open != -1 {
			start := strings.LastIndex(text[:open], " ")
			if start == -1 {
				start = 0
			}
			function = text[start:open]
		}

		isRate := strings.Contains(function, "rate") || strings.Contains(function, "deriv")
		switch {
		case isRate && lookBack != "4m":
			return "rate/deriv want=[4m]"
		case !isRate && lookBack != "3h":
			return "range lookback want=[3h]"
		}
	}

	return ""
}

func TestOnlyHighlightsExpanded(t *testing.T) {
exceptions := map[string]int{
"cmode/shelf.json": 2,
Expand Down
9 changes: 5 additions & 4 deletions cmd/tools/grafana/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,22 +105,23 @@ type variable struct {
options []gjson.Result
}

func allVariables(data []byte) []variable {
variables := make([]variable, 0)
func allVariables(data []byte) map[string]variable {
variables := make(map[string]variable)
gjson.GetBytes(data, "templating.list").ForEach(func(key, value gjson.Result) bool {
// The datasource variable can be ignored
if value.Get("type").String() == "datasource" {
return true
}

variables = append(variables, variable{
v := variable{
name: value.Get("name").String(),
kind: value.Get("type").String(),
query: value.Get("query.query").String(),
refresh: value.Get("refresh").String(),
options: value.Get("options").Array(),
path: key.String(),
})
}
variables[v.name] = v
return true
})
return variables
Expand Down
40 changes: 20 additions & 20 deletions grafana/dashboards/cmode/aggregate.json

Large diffs are not rendered by default.

22 changes: 11 additions & 11 deletions grafana/dashboards/cmode/cdot.json
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@
"targets": [
{
"exemplar": false,
"expr": "sum by (cluster) (node_cifs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, sum by (cluster) (avg_over_time(node_cifs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end())))",
"expr": "sum by (cluster) (node_cifs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, sum by (cluster) (avg_over_time(node_cifs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end())))",
"interval": "",
"legendFormat": "{{cluster}}",
"refId": "A"
Expand Down Expand Up @@ -260,7 +260,7 @@
"targets": [
{
"exemplar": false,
"expr": "sum by (cluster) (node_nfs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, sum by (cluster) (avg_over_time(node_nfs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end())))",
"expr": "sum by (cluster) (node_nfs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, sum by (cluster) (avg_over_time(node_nfs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end())))",
"interval": "",
"legendFormat": "{{cluster}}",
"refId": "A"
Expand Down Expand Up @@ -351,7 +351,7 @@
"targets": [
{
"exemplar": false,
"expr": "sum by (cluster) (volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, sum by (cluster) (avg_over_time(volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end())))",
"expr": "sum by (cluster) (volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, sum by (cluster) (avg_over_time(volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end())))",
"interval": "",
"legendFormat": "{{cluster}}",
"refId": "A"
Expand Down Expand Up @@ -445,7 +445,7 @@
"targets": [
{
"exemplar": false,
"expr": "avg by (cluster) (node_avg_processor_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, avg by (cluster) (avg_over_time(node_avg_processor_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end())))",
"expr": "avg by (cluster) (node_avg_processor_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, avg by (cluster) (avg_over_time(node_avg_processor_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end())))",
"format": "time_series",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -540,7 +540,7 @@
"targets": [
{
"exemplar": false,
"expr": "avg by (cluster) (node_cpu_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, avg by (cluster) (avg_over_time(node_cpu_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end())))",
"expr": "avg by (cluster) (node_cpu_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, avg by (cluster) (avg_over_time(node_cpu_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end())))",
"format": "time_series",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -633,7 +633,7 @@
"targets": [
{
"exemplar": false,
"expr": "avg by (cluster) (node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, avg by (cluster) (avg_over_time(node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end())))",
"expr": "avg by (cluster) (node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, avg by (cluster) (avg_over_time(node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end())))",
"interval": "",
"legendFormat": "{{cluster}}",
"refId": "A"
Expand Down Expand Up @@ -1432,7 +1432,7 @@
"targets": [
{
"exemplar": false,
"expr": "svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}[$__range] @ end()))",
"expr": "svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}[3h] @ end()))",
"interval": "",
"legendFormat": "{{cluster}} - {{svm}}",
"refId": "A"
Expand Down Expand Up @@ -1611,7 +1611,7 @@
"targets": [
{
"exemplar": false,
"expr": "svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}[$__range] @ end()))",
"expr": "svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}[3h] @ end()))",
"interval": "",
"legendFormat": "{{cluster}} - {{svm}}",
"refId": "A"
Expand Down Expand Up @@ -1716,7 +1716,7 @@
{
"datasource": "${DS_PROMETHEUS}",
"exemplar": false,
"expr": "volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}\nand\ntopk($TopResources, avg_over_time(volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}[$__range] @ end()))",
"expr": "volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}\nand\ntopk($TopResources, avg_over_time(volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}[3h] @ end()))",
"interval": "",
"legendFormat": "{{cluster}} - {{svm}} - {{volume}}",
"refId": "A"
Expand Down Expand Up @@ -1807,7 +1807,7 @@
{
"datasource": "${DS_PROMETHEUS}",
"exemplar": false,
"expr": "volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}\nand\ntopk($TopResources, avg_over_time(volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}[$__range] @ end())) ",
"expr": "volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}\nand\ntopk($TopResources, avg_over_time(volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}[3h] @ end())) ",
"interval": "",
"legendFormat": "{{cluster}} - {{svm}} - {{volume}}",
"refId": "A"
Expand Down Expand Up @@ -1899,7 +1899,7 @@
{
"datasource": "${DS_PROMETHEUS}",
"exemplar": false,
"expr": "volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}\nand\ntopk($TopResources, avg_over_time(volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}[$__range] @ end()))",
"expr": "volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}\nand\ntopk($TopResources, avg_over_time(volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}[3h] @ end()))",
"interval": "",
"legendFormat": "{{cluster}} - {{svm}} - {{volume}}",
"refId": "A"
Expand Down
20 changes: 10 additions & 10 deletions grafana/dashboards/cmode/cluster.json
Original file line number Diff line number Diff line change
Expand Up @@ -4041,7 +4041,7 @@
"targets": [
{
"exemplar": false,
"expr": "svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end()))",
"expr": "svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end()))",
"interval": "",
"legendFormat": "{{cluster}} - {{svm}}",
"refId": "A"
Expand Down Expand Up @@ -4216,7 +4216,7 @@
"targets": [
{
"exemplar": false,
"expr": "svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end()))",
"expr": "svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end()))",
"format": "time_series",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -4410,7 +4410,7 @@
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "query_result(topk($TopResources, avg_over_time(svm_vol_read_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[${__range}])+avg_over_time(svm_vol_write_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[${__range}])))",
"definition": "query_result(topk($TopResources, avg_over_time(svm_vol_read_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h])+avg_over_time(svm_vol_write_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h])))",
"description": null,
"error": null,
"hide": 2,
Expand All @@ -4420,7 +4420,7 @@
"name": "TopSVMs",
"options": [],
"query": {
"query": "query_result(topk($TopResources, avg_over_time(svm_vol_read_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[${__range}])+avg_over_time(svm_vol_write_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[${__range}])))",
"query": "query_result(topk($TopResources, avg_over_time(svm_vol_read_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h])+avg_over_time(svm_vol_write_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h])))",
"refId": "StandardVariableQuery"
},
"refresh": 2,
Expand All @@ -4433,7 +4433,7 @@
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[${__range}]))))",
"definition": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[3h]))))",
"description": null,
"error": null,
"hide": 2,
Expand All @@ -4443,7 +4443,7 @@
"name": "TopVolumeAvgLatency",
"options": [],
"query": {
"query": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[${__range}]))))",
"query": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[3h]))))",
"refId": "StandardVariableQuery"
},
"refresh": 2,
Expand All @@ -4456,7 +4456,7 @@
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[${__range}]))))",
"definition": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[3h]))))",
"description": null,
"error": null,
"hide": 2,
Expand All @@ -4466,7 +4466,7 @@
"name": "TopVolumeTotalData",
"options": [],
"query": {
"query": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[${__range}]))))",
"query": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[3h]))))",
"refId": "StandardVariableQuery"
},
"refresh": 2,
Expand All @@ -4479,7 +4479,7 @@
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",node!=\"\"}[${__range}]))))",
"definition": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",node!=\"\"}[3h]))))",
"description": null,
"error": null,
"hide": 2,
Expand All @@ -4489,7 +4489,7 @@
"name": "TopVolumeTotalOps",
"options": [],
"query": {
"query": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",node!=\"\"}[${__range}]))))",
"query": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",node!=\"\"}[3h]))))",
"refId": "StandardVariableQuery"
},
"refresh": 2,
Expand Down
Loading