Skip to content

Commit

Permalink
Merge #99860 #100550
Browse files Browse the repository at this point in the history
99860: tsdb: add tenant-level store metrics to tsdb r=aadityasondhi a=abarganier

Previously, while we had the ability to show tenant-level store metrics in the `/_status/vars` page, these metrics were never written to tsdb.

This is despite the changes in #98077, which did a great job at writing all the metrics in the tenant-specific metric registries, but didn't pull the tenant-specific store metrics out of the store registries. This is because these metrics exist as *child* metrics on the store registry metrics, and we did not previously have logic to individually pick these metrics out of their parent AggGauge/Counter metrics.

This patch adds the logic to do so. Now, for each tenant ID that exists in the recorder's `tenantRegistries` map, we will attempt to pick that tenant's individual child metric & values from all the metrics that exist in TenantsStorageMetrics. This will limit the writing of these tenant-level metrics to only happen in deployments where multiple tenants are running in-process, as environments such as serverless clusters are expected to have an empty `tenantRegistries` map in this recorder.

This is necessary because if we're going to support multi-tenant tsdb, app tenants should be able to see core storage information about their logical cluster, such as `livebytes` which indicates how much live active data exists for the cluster.

Release note: none

Fixes: #99228

100550: sql: adjust a couple of memory monitoring tests r=yuzefovich a=yuzefovich

This commit adjusts a couple of memory monitoring related tests.

`TestAggregatesMonitorMemory` has been rewritten to observe the correct
memory monitor via `crdb_internal.node_memory_monitors` virtual table.

`TestEvaluatedMemoryIsChecked` is just straight up removed. Initially,
this test was expected to verify that builtin functions like `repeat`
perform memory accounting of the intermediate result via our memory
accounting system. However, that changed long time ago in 2b00f15
and now such builtins estimate their result size and return
`errStringTooLarge` error, so the test was no longer verifying what it
intended. This commit removes this test since we do verify the behavior
introduced in 2b00f15 elsewhere (in the
logic tests).

Fixes: #79014.
Fixes: #100119.

Release note: None

Co-authored-by: Alex Barganier <abarganier@crlMBP-NR362FWT1GODM0.local>
Co-authored-by: Yahor Yuzefovich <yahor@cockroachlabs.com>
  • Loading branch information
3 people committed Apr 4, 2023
3 parents ef357d1 + 4dd0304 + 7d04684 commit 0d15e0d
Show file tree
Hide file tree
Showing 16 changed files with 412 additions and 98 deletions.
5 changes: 4 additions & 1 deletion pkg/kv/kvbase/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")

go_library(
name = "kvbase",
srcs = ["constants.go"],
srcs = [
"constants.go",
"metrics.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvbase",
visibility = ["//visibility:public"],
)
Expand Down
17 changes: 17 additions & 0 deletions pkg/kv/kvbase/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvbase

// TenantsStorageMetricsSet is the set of all metric names contained
// within TenantsStorageMetrics, recorded at the individual tenant level.
//
// Made available in kvbase to help avoid import cycles.
var TenantsStorageMetricsSet map[string]struct{}
36 changes: 36 additions & 0 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"time"
"unsafe"

"github.com/cockroachdb/cockroach/pkg/kv/kvbase"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
Expand All @@ -37,6 +38,12 @@ import (
"go.etcd.io/raft/v3/raftpb"
)

func init() {
// Inject the TenantStorageMetricsSet available in the kvbase pkg to
// avoid import cycles.
kvbase.TenantsStorageMetricsSet = tenantsStorageMetricsSet()
}

var (
// Replica metrics.
metaReplicaCount = metric.Metadata{
Expand Down Expand Up @@ -2197,6 +2204,8 @@ func (ref *tenantMetricsRef) assert(ctx context.Context) {
// call acquire and release to properly reference count the metrics for
// individual tenants.
type TenantsStorageMetrics struct {
// NB: If adding more metrics to this struct, be sure to
// also update tenantsStorageMetricsSet().
LiveBytes *aggmetric.AggGauge
KeyBytes *aggmetric.AggGauge
ValBytes *aggmetric.AggGauge
Expand Down Expand Up @@ -2225,6 +2234,33 @@ type TenantsStorageMetrics struct {
tenants syncutil.IntMap // map[int64(roachpb.TenantID)]*tenantStorageMetrics
}

// tenantsStorageMetricsSet returns the set of all metric names contained
// within TenantsStorageMetrics.
//
// see kvbase.TenantsStorageMetricsSet for public access. Assigned in init().
func tenantsStorageMetricsSet() map[string]struct{} {
return map[string]struct{}{
metaLiveBytes.Name: {},
metaKeyBytes.Name: {},
metaValBytes.Name: {},
metaRangeKeyBytes.Name: {},
metaRangeValBytes.Name: {},
metaTotalBytes.Name: {},
metaIntentBytes.Name: {},
metaLiveCount.Name: {},
metaKeyCount.Name: {},
metaValCount.Name: {},
metaRangeKeyCount.Name: {},
metaRangeValCount.Name: {},
metaIntentCount.Name: {},
metaIntentAge.Name: {},
metaGcBytesAge.Name: {},
metaSysBytes.Name: {},
metaSysCount.Name: {},
metaAbortSpanBytes.Name: {},
}
}

var _ metric.Struct = (*TenantsStorageMetrics)(nil)

// MetricStruct makes TenantsStorageMetrics a metric.Struct.
Expand Down
6 changes: 6 additions & 0 deletions pkg/server/status/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,10 @@ go_library(
"//pkg/build",
"//pkg/keys",
"//pkg/kv",
"//pkg/kv/kvbase",
"//pkg/kv/kvpb",
"//pkg/kv/kvserver/liveness",
"//pkg/multitenant",
"//pkg/roachpb",
"//pkg/rpc",
"//pkg/server/status/statuspb",
Expand All @@ -69,6 +71,7 @@ go_library(
"@com_github_cockroachdb_redact//:redact",
"@com_github_dustin_go_humanize//:go-humanize",
"@com_github_elastic_gosigar//:gosigar",
"@com_github_prometheus_client_model//go",
"@com_github_shirou_gopsutil_v3//cpu",
"@com_github_shirou_gopsutil_v3//net",
] + select({
Expand Down Expand Up @@ -134,6 +137,7 @@ go_test(
deps = [
"//pkg/base",
"//pkg/build",
"//pkg/multitenant",
"//pkg/roachpb",
"//pkg/security/securityassets",
"//pkg/security/securitytest",
Expand All @@ -143,6 +147,7 @@ go_test(
"//pkg/sql/sem/catconstants",
"//pkg/testutils/serverutils",
"//pkg/ts/tspb",
"//pkg/ts/tsutil",
"//pkg/util/leaktest",
"//pkg/util/log",
"//pkg/util/log/eventpb",
Expand All @@ -151,6 +156,7 @@ go_test(
"//pkg/util/system",
"//pkg/util/timeutil",
"@com_github_kr_pretty//:pretty",
"@com_github_prometheus_client_model//go",
"@com_github_shirou_gopsutil_v3//net",
"@com_github_stretchr_testify//require",
],
Expand Down
79 changes: 79 additions & 0 deletions pkg/server/status/recorder.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@ import (
"github.com/cockroachdb/cockroach/pkg/build"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvbase"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness"
"github.com/cockroachdb/cockroach/pkg/multitenant"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/rpc"
"github.com/cockroachdb/cockroach/pkg/server/status/statuspb"
Expand All @@ -49,6 +51,7 @@ import (
"github.com/cockroachdb/redact"
"github.com/dustin/go-humanize"
"github.com/elastic/gosigar"
prometheusgo "github.com/prometheus/client_model/go"
)

const (
Expand Down Expand Up @@ -385,6 +388,7 @@ func (mr *MetricsRecorder) GetTimeSeriesData() []tspb.TimeSeriesData {
}

// Record time series from store-level registries.
tIDLabel := multitenant.TenantIDLabel
for storeID, r := range mr.mu.storeRegistries {
storeRecorder := registryRecorder{
registry: r,
Expand All @@ -393,6 +397,21 @@ func (mr *MetricsRecorder) GetTimeSeriesData() []tspb.TimeSeriesData {
timestampNanos: now.UnixNano(),
}
storeRecorder.record(&data)

// Now record secondary tenant store metrics, if any exist in the process.
for tenantID := range mr.mu.tenantRegistries {
tenantStoreRecorder := registryRecorder{
registry: r,
format: storeTimeSeriesPrefix,
source: tsutil.MakeTenantSource(storeID.String(), tenantID.String()),
timestampNanos: now.UnixNano(),
}
tenantID := tenantID.String()
tenantStoreRecorder.recordChild(&data, kvbase.TenantsStorageMetricsSet, &prometheusgo.LabelPair{
Name: &tIDLabel,
Value: &tenantID,
})
}
}
atomic.CompareAndSwapInt64(&mr.lastDataCount, lastDataCount, int64(len(data)))
return data
Expand Down Expand Up @@ -659,6 +678,66 @@ func (rr registryRecorder) record(dest *[]tspb.TimeSeriesData) {
})
}

// recordChild filters the metrics in the registry down to those provided in
// the metricsFilter argument, and iterates through any child metrics that
// may exist on said metric. Child metrics whose label sets contains a match
// against the childLabelFilter are recorded into the provided dest slice of
// type tspb.TimeSeriesData.
//
// NB: Only available for Counter and Gauge metrics.
func (rr registryRecorder) recordChild(
dest *[]tspb.TimeSeriesData,
metricsFilter map[string]struct{},
childLabelFilter *prometheusgo.LabelPair,
) {
labels := rr.registry.GetLabels()
rr.registry.Select(metricsFilter, func(name string, v interface{}) {
prom, ok := v.(metric.PrometheusExportable)
if !ok {
return
}
promIter, ok := v.(metric.PrometheusIterable)
if !ok {
return
}
m := prom.ToPrometheusMetric()
m.Label = append(labels, prom.GetLabels()...)

processChildMetric := func(metric *prometheusgo.Metric) {
found := false
for _, label := range metric.Label {
if label.GetName() == childLabelFilter.GetName() &&
label.GetValue() == childLabelFilter.GetValue() {
found = true
break
}
}
if !found {
return
}
var value float64
if metric.Gauge != nil {
value = *metric.Gauge.Value
} else if metric.Counter != nil {
value = *metric.Counter.Value
} else {
return
}
*dest = append(*dest, tspb.TimeSeriesData{
Name: fmt.Sprintf(rr.format, prom.GetName()),
Source: rr.source,
Datapoints: []tspb.TimeSeriesDatapoint{
{
TimestampNanos: rr.timestampNanos,
Value: value,
},
},
})
}
promIter.Each(m.Label, processChildMetric)
})
}

// GetTotalMemory returns either the total system memory (in bytes) or if
// possible the cgroups available memory.
func GetTotalMemory(ctx context.Context) (int64, error) {
Expand Down
Loading

0 comments on commit 0d15e0d

Please sign in to comment.