Skip to content

Commit

Permalink
Merge #97412
Browse files Browse the repository at this point in the history
97412: server: introduce CriticalNodes rpc r=zachlite a=zachlite

Since #90016, we've had the ability to generate multi-tenant
replication reports via `spanconfig.Reporter.SpanConfigConformance()`.

This commit leverages #90016 to provide multi-tenant friendly
observability of nodes whose replicas are unavailable or underreplicated,
and are therefore considered critical nodes.

A new `CriticalNodes` rpc is available via HTTP. The response
includes a list of node descriptors that are considered critical, and the
corresponding SpanConfigConformanceReport which includes details of
non-conforming ranges contributing to the criticality.

Epic: https://cockroachlabs.atlassian.net/browse/CRDB-10630
Release note: None

Co-authored-by: zachlite <zachlite@gmail.com>
  • Loading branch information
craig[bot] and zachlite committed Mar 2, 2023
2 parents ac95590 + 1cdb1ac commit c58dd8a
Show file tree
Hide file tree
Showing 6 changed files with 189 additions and 0 deletions.
46 changes: 46 additions & 0 deletions docs/generated/http/full.md
Original file line number Diff line number Diff line change
Expand Up @@ -2934,6 +2934,52 @@ Support status: [reserved](#support-status)



## CriticalNodes

`POST /_status/critical_nodes`

CriticalNodes retrieves nodes that are considered critical. A critical node
is one whose unexpected termination could result in data loss. A node is
considered critical if any of its replicas are unavailable or
under-replicated. The response includes a list of node descriptors that are
considered critical, and the corresponding SpanConfigConformanceReport that
includes details of non-conforming ranges contributing to the criticality.

Support status: [reserved](#support-status)

#### Request Parameters













#### Response Parameters







| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| critical_nodes | [cockroach.roachpb.NodeDescriptor](#cockroach.server.serverpb.CriticalNodesResponse-cockroach.roachpb.NodeDescriptor) | repeated | | [reserved](#support-status) |
| report | [cockroach.roachpb.SpanConfigConformanceReport](#cockroach.server.serverpb.CriticalNodesResponse-cockroach.roachpb.SpanConfigConformanceReport) | | | [reserved](#support-status) |







## Stacks

`GET /_status/stacks/{node_id}`
Expand Down
1 change: 1 addition & 0 deletions pkg/server/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,7 @@ go_test(
"bench_test.go",
"config_test.go",
"connectivity_test.go",
"critical_nodes_test.go",
"decommission_test.go",
"drain_test.go",
"graphite_test.go",
Expand Down
71 changes: 71 additions & 0 deletions pkg/server/critical_nodes_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package server

import (
"context"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/config/zonepb"
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
"github.com/stretchr/testify/require"
)

func TestCriticalNodes(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
testCluster := serverutils.StartNewTestCluster(t, 3, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{
Knobs: base.TestingKnobs{Server: &TestingKnobs{
DefaultSystemZoneConfigOverride: zonepb.DefaultZoneConfigRef(),
}},
},
})
s := testCluster.Server(0).TenantStatusServer().(serverpb.StatusServer)
ctx := context.Background()
defer testCluster.Stopper().Stop(ctx)

// Add a table, and alter its zone config to something
// that can't be satisfied, given this is a 3-node cluster.
db := testCluster.ServerConn(0)
_, err := db.Exec("CREATE TABLE test (x int PRIMARY KEY)")
require.NoError(t, err)
_, err = db.Exec("ALTER TABLE test CONFIGURE ZONE USING num_replicas = 5;")
require.NoError(t, err)

testutils.SucceedsWithin(t, func() error {
res, err := s.CriticalNodes(ctx, &serverpb.CriticalNodesRequest{})
if err != nil {
return err
}
if res.Report.IsEmpty() {
return errors.Errorf(
"expected report to not be empty, got: {over: %d, under: %d, violating: %d, unavailable: %d}",
len(res.Report.OverReplicated),
len(res.Report.UnderReplicated),
len(res.Report.ViolatingConstraints),
len(res.Report.Unavailable),
)
}

// We should expect all 3 nodes to be critical, because they all contain
// a replica for the under-replicated range.
require.Equal(t, 3, len(res.CriticalNodes))
return nil
}, 2*time.Minute)
}
1 change: 1 addition & 0 deletions pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -895,6 +895,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
clock,
distSender,
rangestats.NewFetcher(db),
node,
)

keyVisualizerServer := &KeyVisualizerServer{
Expand Down
20 changes: 20 additions & 0 deletions pkg/server/serverpb/status.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1930,6 +1930,13 @@ message ListExecutionInsightsResponse {
];
}


message CriticalNodesRequest {}
message CriticalNodesResponse {
repeated roachpb.NodeDescriptor critical_nodes = 1 [(gogoproto.nullable) = false];
roachpb.SpanConfigConformanceReport report = 2 [(gogoproto.nullable) = false];
}

service Status {
// Certificates retrieves a copy of the TLS certificates.
rpc Certificates(CertificatesRequest) returns (CertificatesResponse) {
Expand Down Expand Up @@ -2172,6 +2179,19 @@ service Status {
};
}

// CriticalNodes retrieves nodes that are considered critical. A critical node
// is one whose unexpected termination could result in data loss. A node is
// considered critical if any of its replicas are unavailable or
// under-replicated. The response includes a list of node descriptors that are
// considered critical, and the corresponding SpanConfigConformanceReport that
// includes details of non-conforming ranges contributing to the criticality.
rpc CriticalNodes(CriticalNodesRequest) returns (CriticalNodesResponse) {
option (google.api.http) = {
post: "/_status/critical_nodes"
body: "*"
};
}

// Stacks retrieves the stack traces of all goroutines on a given node.
rpc Stacks(StacksRequest) returns (JSONResponse) {
option (google.api.http) = {
Expand Down
50 changes: 50 additions & 0 deletions pkg/server/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,7 @@ type systemStatusServer struct {
spanConfigReporter spanconfig.Reporter
distSender *kvcoord.DistSender
rangeStatsFetcher *rangestats.Fetcher
node *Node
}

// StmtDiagnosticsRequester is the interface into *stmtdiagnostics.Registry
Expand Down Expand Up @@ -612,6 +613,7 @@ func newSystemStatusServer(
clock *hlc.Clock,
distSender *kvcoord.DistSender,
rangeStatsFetcher *rangestats.Fetcher,
node *Node,
) *systemStatusServer {
server := newStatusServer(
ambient,
Expand Down Expand Up @@ -639,6 +641,7 @@ func newSystemStatusServer(
spanConfigReporter: spanConfigReporter,
distSender: distSender,
rangeStatsFetcher: rangeStatsFetcher,
node: node,
}
}

Expand Down Expand Up @@ -850,6 +853,53 @@ func recordedSpansToTraceEvents(spans []tracingpb.RecordedSpan) []*serverpb.Trac
return output
}

// CriticalNodes retrieves nodes that are considered critical. A critical node
// is one whose unexpected termination could result in data loss. A node is
// considered critical if any of its replicas are unavailable or
// under-replicated. The response includes a list of node descriptors that are
// considered critical, and the corresponding SpanConfigConformanceReport that
// includes details of non-conforming ranges contributing to the criticality.
func (s *systemStatusServer) CriticalNodes(
ctx context.Context, req *serverpb.CriticalNodesRequest,
) (*serverpb.CriticalNodesResponse, error) {
ctx = s.AnnotateCtx(ctx)
if _, err := s.privilegeChecker.requireAdminUser(ctx); err != nil {
return nil, err
}
conformance, err := s.node.SpanConfigConformance(
ctx, &roachpb.SpanConfigConformanceRequest{
Spans: []roachpb.Span{keys.EverythingSpan},
})
if err != nil {
return nil, err
}

critical := make(map[roachpb.NodeID]bool)
for _, r := range conformance.Report.UnderReplicated {
for _, desc := range r.RangeDescriptor.Replicas().Descriptors() {
critical[desc.NodeID] = true
}
}
for _, r := range conformance.Report.Unavailable {
for _, desc := range r.RangeDescriptor.Replicas().Descriptors() {
critical[desc.NodeID] = true
}
}

res := &serverpb.CriticalNodesResponse{
CriticalNodes: nil,
Report: conformance.Report,
}
for nodeID := range critical {
ns, err := s.nodeStatus(ctx, &serverpb.NodeRequest{NodeId: nodeID.String()})
if err != nil {
return nil, err
}
res.CriticalNodes = append(res.CriticalNodes, ns.Desc)
}
return res, nil
}

// AllocatorRange returns simulated allocator info for the requested range.
func (s *systemStatusServer) AllocatorRange(
ctx context.Context, req *serverpb.AllocatorRangeRequest,
Expand Down

0 comments on commit c58dd8a

Please sign in to comment.