Skip to content

Commit

Permalink
server, tenant: hot ranges structured logging for all cluster types
Browse files Browse the repository at this point in the history
This patch introduces periodic logging of hot ranges info (ie QPS and
relevant DB objects info). It is done for both dedicated and
serverless cluster types, and by default called every 4h.
Stats collection can be enabled/disabled, or changed interval
through cluster settings.

Release note: None
  • Loading branch information
koorosh committed Apr 20, 2023
1 parent b767225 commit 440aa71
Show file tree
Hide file tree
Showing 16 changed files with 567 additions and 26 deletions.
28 changes: 28 additions & 0 deletions docs/generated/eventlog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2802,6 +2802,34 @@ ChangefeedFailed events.
| `InitialScan` | The desired behavior of initial scans (ex: yes, no, only) | no |
| `Format` | The data format being emitted (ex: JSON, Avro). | no |

### `hot_ranges_stats`

An event of type `hot_ranges_stats`


| Field | Description | Sensitive |
|--|--|--|
| `RangeID` | | no |
| `Qps` | | no |
| `DatabaseName` | DatabaseName is the name of the database in which the index was created. | yes |
| `TableName` | TableName is the name of the table on which the index was created. | yes |
| `IndexName` | IndexName is the name of the index within the scope of the given table. | yes |
| `SchemaName` | SchemaName is the name of the schema in which the index was created. | yes |
| `LeaseholderNodeID` | LeaseholderNodeID indicates the Node ID that is the current leaseholder for the given range. | no |
| `WritesPerSecond` | Writes per second is the recent number of keys written per second on this range. | no |
| `ReadsPerSecond` | Reads per second is the recent number of keys read per second on this range. | no |
| `WriteBytesPerSecond` | Write bytes per second is the recent number of bytes written per second on this range. | no |
| `ReadBytesPerSecond` | Read bytes per second is the recent number of bytes read per second on this range. | no |
| `CPUTimePerSecond` | CPU time per second is the recent cpu usage in nanoseconds of this range. | no |


#### Common fields

| Field | Description | Sensitive |
|--|--|--|
| `Timestamp` | The timestamp of the event. Expressed as nanoseconds since the Unix epoch. | no |
| `EventType` | The type of the event. | no |

### `recovery_event`

An event of type `recovery_event` is an event that is logged on every invocation of BACKUP,
Expand Down
4 changes: 4 additions & 0 deletions pkg/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ ALL_TESTS = [
"//pkg/server/serverrules:serverrules_test",
"//pkg/server/settingswatcher:settingswatcher_test",
"//pkg/server/status:status_test",
"//pkg/server/structlogging:structlogging_test",
"//pkg/server/systemconfigwatcher:systemconfigwatcher_test",
"//pkg/server/telemetry:telemetry_test",
"//pkg/server/tenantsettingswatcher:tenantsettingswatcher_test",
Expand Down Expand Up @@ -1475,6 +1476,8 @@ GO_TARGETS = [
"//pkg/server/status/statuspb:statuspb",
"//pkg/server/status:status",
"//pkg/server/status:status_test",
"//pkg/server/structlogging:structlogging",
"//pkg/server/structlogging:structlogging_test",
"//pkg/server/systemconfigwatcher/systemconfigwatchertest:systemconfigwatchertest",
"//pkg/server/systemconfigwatcher:systemconfigwatcher",
"//pkg/server/systemconfigwatcher:systemconfigwatcher_test",
Expand Down Expand Up @@ -2863,6 +2866,7 @@ GET_X_DATA_TARGETS = [
"//pkg/server/settingswatcher:get_x_data",
"//pkg/server/status:get_x_data",
"//pkg/server/status/statuspb:get_x_data",
"//pkg/server/structlogging:get_x_data",
"//pkg/server/systemconfigwatcher:get_x_data",
"//pkg/server/systemconfigwatcher/systemconfigwatchertest:get_x_data",
"//pkg/server/telemetry:get_x_data",
Expand Down
1 change: 1 addition & 0 deletions pkg/server/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ go_library(
"//pkg/server/settingswatcher",
"//pkg/server/status",
"//pkg/server/status/statuspb",
"//pkg/server/structlogging",
"//pkg/server/systemconfigwatcher",
"//pkg/server/telemetry",
"//pkg/server/tenantsettingswatcher",
Expand Down
6 changes: 6 additions & 0 deletions pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/server/serverrules"
"github.com/cockroachdb/cockroach/pkg/server/status"
"github.com/cockroachdb/cockroach/pkg/server/structlogging"
"github.com/cockroachdb/cockroach/pkg/server/systemconfigwatcher"
"github.com/cockroachdb/cockroach/pkg/server/telemetry"
"github.com/cockroachdb/cockroach/pkg/server/tenantsettingswatcher"
Expand Down Expand Up @@ -102,6 +103,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/goschedstats"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/log/logcrash"
"github.com/cockroachdb/cockroach/pkg/util/metric"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/netutil"
Expand Down Expand Up @@ -2053,6 +2055,10 @@ func (s *Server) AcceptClients(ctx context.Context) error {
return err
}

if logcrash.DiagnosticsReportingEnabled.Get(&s.ClusterSettings().SV) {
structlogging.StartHotRangesLoggingScheduler(ctx, s.stopper, s.status, *s.sqlServer.internalExecutor, s.ClusterSettings())
}

s.sqlServer.isReady.Set(true)

log.Event(ctx, "server ready")
Expand Down
53 changes: 53 additions & 0 deletions pkg/server/structlogging/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
load("//build/bazelutil/unused_checker:unused.bzl", "get_x_data")
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

go_library(
name = "structlogging",
srcs = ["hot_ranges_log.go"],
importpath = "github.com/cockroachdb/cockroach/pkg/server/structlogging",
visibility = ["//visibility:public"],
deps = [
"//pkg/server/serverpb",
"//pkg/settings",
"//pkg/settings/cluster",
"//pkg/sql",
"//pkg/util/log",
"//pkg/util/log/eventpb",
"//pkg/util/log/logpb",
"//pkg/util/log/logutil",
"//pkg/util/stop",
"//pkg/util/timeutil",
],
)

go_test(
name = "structlogging_test",
srcs = [
"hot_ranges_log_test.go",
"main_test.go",
],
args = ["-test.timeout=295s"],
deps = [
":structlogging",
"//pkg/base",
"//pkg/ccl",
"//pkg/kv/kvserver",
"//pkg/roachpb",
"//pkg/security/securityassets",
"//pkg/security/securitytest",
"//pkg/server",
"//pkg/server/serverpb",
"//pkg/testutils",
"//pkg/testutils/serverutils",
"//pkg/testutils/testcluster",
"//pkg/util/leaktest",
"//pkg/util/log",
"//pkg/util/log/logcrash",
"//pkg/util/log/logtestutils",
"//pkg/util/randutil",
"@com_github_cockroachdb_errors//:errors",
"@com_github_stretchr_testify//require",
],
)

get_x_data(name = "get_x_data")
130 changes: 130 additions & 0 deletions pkg/server/structlogging/hot_ranges_log.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package structlogging

import (
"context"
"time"

"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
"github.com/cockroachdb/cockroach/pkg/util/log/logpb"
"github.com/cockroachdb/cockroach/pkg/util/log/logutil"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

// ReportTopHottestRanges limits the number of ranges to be reported per iteration
const ReportTopHottestRanges = 5

var TelemetryHotRangesStatsInterval = settings.RegisterDurationSetting(
settings.TenantWritable,
"server.telemetry.hot_ranges_stats.interval",
"the time interval to log hot ranges stats",
4*time.Hour,
settings.NonNegativeDuration,
)

var TelemetryHotRangesStatsEnabled = settings.RegisterBoolSetting(
settings.TenantWritable,
"server.telemetry.hot_ranges_stats.enabled",
"enable/disable capturing hot ranges statistics to the telemetry logging channel",
true,
)

var TelemetryHotRangesStatsLoggingDelay = settings.RegisterDurationSetting(
settings.TenantWritable,
"server.telemetry.hot_ranges_stats.logging_delay",
"the time delay between emitting individual hot ranges stats logs",
1*time.Second,
settings.NonNegativeDuration,
)

// hotRangesLoggingScheduler is responsible for logging index usage stats
// on a scheduled interval.
type hotRangesLoggingScheduler struct {
ie sql.InternalExecutor
sServer serverpb.TenantStatusServer
st *cluster.Settings
}

// StartHotRangesLoggingScheduler starts the capture index usage statistics logging scheduler.
func StartHotRangesLoggingScheduler(
ctx context.Context,
stopper *stop.Stopper,
sServer serverpb.TenantStatusServer,
ie sql.InternalExecutor,
st *cluster.Settings,
) {
scheduler := hotRangesLoggingScheduler{
ie: ie,
sServer: sServer,
st: st,
}
scheduler.start(ctx, stopper)
}

func (s *hotRangesLoggingScheduler) start(ctx context.Context, stopper *stop.Stopper) {
_ = stopper.RunAsyncTask(ctx, "hot-ranges-stats", func(ctx context.Context) {
ticker := time.NewTicker(TelemetryHotRangesStatsInterval.Get(&s.st.SV))
defer ticker.Stop()

TelemetryHotRangesStatsInterval.SetOnChange(&s.st.SV, func(ctx context.Context) {
ticker.Reset(TelemetryHotRangesStatsInterval.Get(&s.st.SV))
})

for {
select {
case <-stopper.ShouldQuiesce():
return
case <-ctx.Done():
return
case <-ticker.C:
if !TelemetryHotRangesStatsEnabled.Get(&s.st.SV) {
continue
}
resp, err := s.sServer.HotRangesV2(ctx, &serverpb.HotRangesRequest{PageSize: ReportTopHottestRanges})
if err != nil {
log.Warningf(ctx, "failed to get hot ranges: %s", err)
continue
}
var events []logpb.EventPayload
ts := timeutil.Now().UnixNano()

for _, r := range resp.Ranges {
hrEvent := &eventpb.HotRangesStats{
RangeID: int64(r.RangeID),
Qps: r.QPS,
DatabaseName: r.DatabaseName,
SchemaName: r.SchemaName,
TableName: r.TableName,
IndexName: r.IndexName,
CPUTimePerSecond: r.CPUTimePerSecond,
ReadBytesPerSecond: r.ReadBytesPerSecond,
WriteBytesPerSecond: r.WriteBytesPerSecond,
ReadsPerSecond: r.ReadsPerSecond,
WritesPerSecond: r.WritesPerSecond,
LeaseholderNodeID: int32(r.LeaseholderNodeID),
CommonEventDetails: logpb.CommonEventDetails{
Timestamp: ts,
},
}
events = append(events, hrEvent)
}
logutil.LogEventsWithDelay(ctx, events, stopper, TelemetryHotRangesStatsLoggingDelay.Get(&s.st.SV))
}
}
})
}
103 changes: 103 additions & 0 deletions pkg/server/structlogging/hot_ranges_log_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package structlogging_test

import (
"context"
"math"
"regexp"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/ccl"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/server/structlogging"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/log/logcrash"
"github.com/cockroachdb/cockroach/pkg/util/log/logtestutils"
"github.com/cockroachdb/errors"
"github.com/stretchr/testify/require"
)

func TestHotRangesStats(t *testing.T) {
ctx := context.Background()
defer leaktest.AfterTest(t)()
ccl.TestingEnableEnterprise()
defer ccl.TestingDisableEnterprise()
sc := log.ScopeWithoutShowLogs(t)
defer sc.Close(t)

cleanup := logtestutils.InstallTelemetryLogFileSink(sc, t)
defer cleanup()

s, _, _ := serverutils.StartServer(t, base.TestServerArgs{
DefaultTestTenant: base.TestTenantDisabled,
StoreSpecs: []base.StoreSpec{
base.DefaultTestStoreSpec,
base.DefaultTestStoreSpec,
base.DefaultTestStoreSpec,
},
Knobs: base.TestingKnobs{
Store: &kvserver.StoreTestingKnobs{
DisableReplicaRebalancing: true,
},
},
})
defer s.Stopper().Stop(ctx)

logcrash.DiagnosticsReportingEnabled.Override(ctx, &s.ClusterSettings().SV, true)
structlogging.TelemetryHotRangesStatsEnabled.Override(ctx, &s.ClusterSettings().SV, true)
structlogging.TelemetryHotRangesStatsInterval.Override(ctx, &s.ClusterSettings().SV, 500*time.Millisecond)
structlogging.TelemetryHotRangesStatsLoggingDelay.Override(ctx, &s.ClusterSettings().SV, 10*time.Millisecond)

tenantID := roachpb.MustMakeTenantID(2)
tt, err := s.StartTenant(ctx, base.TestTenantArgs{
TenantID: tenantID,
Settings: s.ClusterSettings(),
})
require.NoError(t, err)

testutils.SucceedsSoon(t, func() error {
ss := tt.TenantStatusServer().(serverpb.TenantStatusServer)
resp, err := ss.HotRangesV2(ctx, &serverpb.HotRangesRequest{TenantID: tenantID.String()})
if err != nil {
return err
}
if len(resp.Ranges) == 0 {
return errors.New("waiting for hot ranges to be collected")
}
return nil
})

testutils.SucceedsWithin(t, func() error {
log.Flush()
entries, err := log.FetchEntriesFromFiles(
0,
math.MaxInt64,
10000,
regexp.MustCompile(`"EventType":"hot_ranges_stats"`),
log.WithMarkedSensitiveData,
)
if err != nil {
return err
}
if len(entries) == 0 {
return errors.New("waiting for logs")
}
return nil
}, 5*time.Second)
}
Loading

0 comments on commit 440aa71

Please sign in to comment.