Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

roachtest: opt some tests into aggressive stats checks #87106

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pkg/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ ALL_TESTS = [
"//pkg/cmd/reduce/reduce:reduce_test",
"//pkg/cmd/release:release_test",
"//pkg/cmd/roachtest/clusterstats:clusterstats_test",
"//pkg/cmd/roachtest/registry:registry_test",
"//pkg/cmd/roachtest/tests:tests_test",
"//pkg/cmd/roachtest:roachtest_test",
"//pkg/cmd/teamcity-trigger:teamcity-trigger_test",
Expand Down Expand Up @@ -975,6 +976,7 @@ GO_TARGETS = [
"//pkg/cmd/roachtest/clusterstats:clusterstats_test",
"//pkg/cmd/roachtest/option:option",
"//pkg/cmd/roachtest/registry:registry",
"//pkg/cmd/roachtest/registry:registry_test",
"//pkg/cmd/roachtest/roachtestutil:roachtestutil",
"//pkg/cmd/roachtest/spec:spec",
"//pkg/cmd/roachtest/test:test",
Expand Down
8 changes: 7 additions & 1 deletion pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import (
"github.com/armon/circbuf"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
Expand Down Expand Up @@ -1371,7 +1372,12 @@ func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, t *testImpl)
if err := contextutil.RunWithTimeout(
ctx, "consistency check", 5*time.Minute,
func(ctx context.Context) error {
return roachtestutil.CheckReplicaDivergenceOnDB(ctx, t.L(), db)
err := roachtestutil.CheckReplicaDivergenceOnDB(ctx, t.L(), db, t.Spec().(*registry.TestSpec).FullConsistencyCheck)
if err != nil {
// Failed consistency checks are always KV's problem.
err = registry.WrapWithOwner(err, registry.OwnerKV)
}
return err
},
); err != nil {
t.Errorf("consistency check failed: %v", err)
Expand Down
61 changes: 36 additions & 25 deletions pkg/cmd/roachtest/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/cmd/internal/issues"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/internal/team"
rperrors "github.com/cockroachdb/cockroach/pkg/roachprod/errors"
Expand All @@ -33,14 +34,6 @@ type githubIssues struct {
teamLoader func() (team.Map, error)
}

type issueCategory int

const (
otherErr issueCategory = iota
clusterCreationErr
sshErr
)

func newGithubIssues(
disable bool, c *clusterImpl, vmCreateOpts *vm.CreateOpts, l *logger.Logger,
) *githubIssues {
Expand Down Expand Up @@ -68,26 +61,27 @@ func (g *githubIssues) shouldPost(t test.Test) bool {
t.Spec().(*registry.TestSpec).Cluster.NodeCount > 0
}

type issueOverride struct {
owner registry.Owner
testName string
msgPrefix string
}

func (g *githubIssues) createPostRequest(
t test.Test, cat issueCategory, message string,
t test.Test, override issueOverride, message string,
) issues.PostRequest {
var mention []string
var projColID int

issueOwner := t.Spec().(*registry.TestSpec).Owner
if o := override.owner; o != "" {
issueOwner = o
}
issueName := t.Name()

messagePrefix := ""
// Overrides to shield eng teams from potential flakes
if cat == clusterCreationErr {
issueOwner = registry.OwnerDevInf
issueName = "cluster_creation"
messagePrefix = fmt.Sprintf("test %s was skipped due to ", t.Name())
} else if cat == sshErr {
issueOwner = registry.OwnerTestEng
issueName = "ssh_problem"
messagePrefix = fmt.Sprintf("test %s failed due to ", t.Name())
if n := override.testName; n != "" {
issueName = n
}
messagePrefix := override.msgPrefix

teams, err := g.teamLoader()
if err != nil {
Expand Down Expand Up @@ -161,19 +155,36 @@ func (g *githubIssues) MaybePost(t *testImpl, message string) error {
return nil
}

cat := otherErr

var o issueOverride
// Overrides to shield eng teams from potential flakes
firstFailure := t.firstFailure()
if failureContainsError(firstFailure, errClusterProvisioningFailed) {
cat = clusterCreationErr
o.owner = registry.OwnerDevInf
o.testName = "cluster_creation"
o.msgPrefix = fmt.Sprintf("test %s was skipped due to ", t.Name())
} else if failureContainsError(firstFailure, rperrors.ErrSSH255) {
cat = sshErr
o.owner = registry.OwnerTestEng
o.testName = "ssh_problem"
o.msgPrefix = fmt.Sprintf("test %s failed due to ", t.Name())
} else if failureContainsError(firstFailure, roachtestutil.ErrMarkConsistencyCheckFailed) {
o.owner = registry.OwnerKV
o.testName = "consistency_check"
o.msgPrefix = fmt.Sprintf("consistency check failed after running %s", t.Name())
} else {
for _, err := range firstFailure.errors {
owner, ok := registry.OwnerFromErr(err)
if !ok {
continue
}
o.owner = owner
o.msgPrefix = "owner overridden by error"
break
}
}

return g.issuePoster(
context.Background(),
issues.UnitTestFormatter,
g.createPostRequest(t, cat, message),
g.createPostRequest(t, o, message),
)
}
17 changes: 16 additions & 1 deletion pkg/cmd/roachtest/registry/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
load("//build/bazelutil/unused_checker:unused.bzl", "get_x_data")
load("@io_bazel_rules_go//go:def.bzl", "go_library")
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

go_library(
name = "registry",
Expand All @@ -17,6 +17,21 @@ go_library(
"//pkg/cmd/roachtest/cluster",
"//pkg/cmd/roachtest/spec",
"//pkg/cmd/roachtest/test",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_errors//errbase",
"@com_github_cockroachdb_errors//errorspb",
"@com_github_gogo_protobuf//proto",
],
)

go_test(
name = "registry_test",
srcs = ["owners_test.go"],
args = ["-test.timeout=295s"],
embed = [":registry"],
deps = [
"@com_github_cockroachdb_errors//:errors",
"@com_github_stretchr_testify//require",
],
)

Expand Down
85 changes: 85 additions & 0 deletions pkg/cmd/roachtest/registry/owners.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@

package registry

import (
"context"
"fmt"

"github.com/cockroachdb/errors"
"github.com/cockroachdb/errors/errbase"
"github.com/cockroachdb/errors/errorspb"
"github.com/gogo/protobuf/proto"
)

// Owner is a valid entry for the Owners field of a roachtest. They should be
// teams, not individuals.
type Owner string
Expand All @@ -31,3 +41,78 @@ const (
OwnerDevInf Owner = `dev-inf`
OwnerMultiTenant Owner = `multi-tenant`
)

func init() {
pKey := errors.GetTypeKey(&ownedError{})
errors.RegisterWrapperEncoder(pKey, encodeOwnedError)
errors.RegisterWrapperDecoder(pKey, decodeOwnedError)
}

func encodeOwnedError(
_ context.Context, err error,
) (msgPrefix string, safe []string, details proto.Message) {
var e *ownedError
errors.As(err, &e)
details = &errorspb.StringsPayload{
Details: []string{string(e.owner)},
}
return "", nil, details
}

func decodeOwnedError(
ctx context.Context, cause error, msgPrefix string, safeDetails []string, payload proto.Message,
) error {
m, ok := payload.(*errorspb.StringsPayload)
if !ok || len(m.Details) != 1 {
return nil
}
owner := Owner(m.Details[0])
return &ownedError{
owner: owner,
wrapped: cause,
}
}

type ownedError struct {
owner Owner
wrapped error
}

// Unwrap implements errors.Wrapper.
func (e *ownedError) Unwrap() error {
return e.wrapped
}

// Format implements fmt.Formatter.
func (e *ownedError) Format(f fmt.State, verb rune) {
errors.FormatError(e, f, verb)
}

// FormatError implements errors.Formatter.
func (e *ownedError) FormatError(p errbase.Printer) (next error) {
p.Printf("owned by %s: %s", e.owner, e.wrapped)
return e.wrapped
}

func (e *ownedError) Error() string {
return fmt.Sprint(e)
}

// WrapWithOwner wraps the error with a hint at who should own the
// problem. The hint can later be retrieved via OwnerFromErr.
func WrapWithOwner(err error, owner Owner) error {
return &ownedError{
owner: owner,
wrapped: err,
}
}

// OwnerFromErr returns the Owner associated previously via WrapWithOwner, to the error,
// if any.
func OwnerFromErr(err error) (Owner, bool) {
var oe *ownedError
if errors.As(err, &oe) {
return oe.owner, true
}
return "", false
}
43 changes: 43 additions & 0 deletions pkg/cmd/roachtest/registry/owners_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
//

package registry

import (
"context"
"testing"

"github.com/cockroachdb/errors"
"github.com/stretchr/testify/require"
)

func TestOwnedError(t *testing.T) {
ctx := context.Background()
err := errors.DecodeError(ctx, errors.EncodeError(
ctx, WrapWithOwner(context.Canceled, OwnerTestEng),
))
t.Log(err)
require.True(t, errors.HasType(err, (*ownedError)(nil)))
require.True(t, errors.Is(err, context.Canceled))
{
owner, ok := OwnerFromErr(err)
require.True(t, ok)
require.Equal(t, OwnerTestEng, owner)
}

// Last owner wins.
err = WrapWithOwner(err, OwnerKV)
{
owner, ok := OwnerFromErr(err)
require.True(t, ok)
require.Equal(t, OwnerKV, owner)
}
}
5 changes: 5 additions & 0 deletions pkg/cmd/roachtest/registry/test_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ type TestSpec struct {
// cannot be run with encryption enabled.
EncryptionSupport EncryptionSupport

// When FullConsistencyCheck is set, after a successful run of the test
// roachtest will run a full KV consistency check and mark the test as
// failed if that check discovers any discrepancies.
FullConsistencyCheck bool

// Run is the test function.
Run func(ctx context.Context, t test.Test, c cluster.Cluster)
}
Expand Down
30 changes: 22 additions & 8 deletions pkg/cmd/roachtest/roachtestutil/consistency_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,37 @@ package roachtestutil
import (
"context"
gosql "database/sql"
"fmt"

"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/errors"
)

// ErrMarkConsistencyCheckFailed marks errors that indicate a replica divergence
// problem detected in CheckReplicaDivergenceOnDB.
var ErrMarkConsistencyCheckFailed = errors.New("consistency check failed")

// CheckReplicaDivergenceOnDB runs a stats-only consistency check via the
// provided DB. It ignores transient errors that can result from the
// implementation of crdb_internal.check_consistency, so a nil result
// does not prove anything.
func CheckReplicaDivergenceOnDB(ctx context.Context, l *logger.Logger, db *gosql.DB) error {
// provided DB. It ignores any errors running the checks and will only propagate
// an error resulting from an actual detected discrepancy. In other words, a nil
// result does not prove anything.
func CheckReplicaDivergenceOnDB(
ctx context.Context, l *logger.Logger, db *gosql.DB, statsOnly bool,
) error {
// NB: we set a statement_timeout since context cancellation won't work here,
// see:
// https://github.com/cockroachdb/cockroach/pull/34520
//
// We've seen the consistency checks hang indefinitely in some cases.
rows, err := db.QueryContext(ctx, `
//
// NB: can't use prepared statements since there are two statements here,
// so Sprintf it is.
rows, err := db.QueryContext(ctx, fmt.Sprintf(`
SET statement_timeout = '5m';
SELECT t.range_id, t.start_key_pretty, t.status, t.detail
FROM
crdb_internal.check_consistency(true, '', '') as t
WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`)
crdb_internal.check_consistency(%t, '', '') as t
WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`, statsOnly))
if err != nil {
// TODO(tbg): the checks can fail for silly reasons like missing gossiped
// descriptors, etc. -- not worth failing the test for. Ideally this would
Expand All @@ -58,5 +68,9 @@ WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`)
l.Printf("consistency check failed with %v; ignoring", err)
return nil
}
return finalErr

if finalErr != nil {
return errors.Mark(finalErr, ErrMarkConsistencyCheckFailed)
}
return nil
}
Loading