Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
117703: allocatorimpl: lower default io overload lease shed threshold to 0.5 r=andrewbaptist a=kvoli

transfers and replica rebalancing to `0.3` and `0.4` respectively. By the same rationale in cockroachdb#113667, lower the default threshold for shedding leases based on a store's IO overload from `0.9` to `0.5`. Note lease shedding on IO overload is not enabled by default.

Epic: None
Release note: None

118249: roachtest: deflake upgrade_skip_version r=rafiss a=rafiss

Give more time for the upgrade to complete, since it seems to take a while in CI. This is done by adding an option to customize the retry duration.

fixes cockroachdb#118153
Release note: None

Co-authored-by: Austen McClernon <austen@cockroachlabs.com>
Co-authored-by: Rafi Shamim <rafi@cockroachlabs.com>
  • Loading branch information
3 people committed Jan 23, 2024
3 parents f213b9c + 5bb1a5d + 0fd0269 commit e6a9c92
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 12 deletions.
4 changes: 2 additions & 2 deletions pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ const (
// DefaultLeaseIOOverloadShedThreshold is used to shed leases from stores
// with an IO overload score greater than the this threshold. This is
// typically used in conjunction with IOOverloadMeanThreshold below.
DefaultLeaseIOOverloadShedThreshold = 0.9
DefaultLeaseIOOverloadShedThreshold = 0.5

// IOOverloadMeanThreshold is the percentage above the mean after which a
// store could be conisdered IO overload if also exceeding the absolute IO
Expand Down Expand Up @@ -196,7 +196,7 @@ var LeaseIOOverloadShedThreshold = settings.RegisterFloatSetting(
"kv.allocator.lease_shed_io_overload_threshold",
"a store will shed its leases and receive no new leases when its "+
"IO overload score is above this value and "+
"`kv.allocator.io_overload_threshold_enforcement_leases` is `shed`",
"`kv.allocator.lease_io_overload_threshold_enforcement` is `shed`",
DefaultLeaseIOOverloadShedThreshold,
)

Expand Down
43 changes: 34 additions & 9 deletions pkg/sql/logictest/logic.go
Original file line number Diff line number Diff line change
Expand Up @@ -306,11 +306,12 @@ import (
// - colnames: column names are verified (the expected column names
// are the first line in the expected results).
// - retry: if the expected results do not match the actual results, the
// test will be retried with exponential backoff up to some maximum
// duration. If the test succeeds at any time during that period, it
// is considered successful. Otherwise, it is a failure. See
// testutils.SucceedsSoon for more information. If run with the
// -rewrite flag, the query will be run only once after a 2s sleep.
// test will be retried with exponential backoff up to the duration
// given by retry_duration. If the test succeeds at any time during
// that period, it is considered successful. Otherwise, it is a
// failure. See testutils.SucceedsSoon for more information. If run
// with the -rewrite flag, the query will be run only once after a
// 2s sleep.
// - async: runs the query asynchronously, marking it as a pending
// query using the label parameter as a unique name, to be completed
// and validated later with "awaitquery". This is intended for use
Expand Down Expand Up @@ -418,6 +419,10 @@ import (
// duration until the test passes, or the alloted time has elapsed.
// This is similar to the retry option of the query directive.
//
// - retry_duration <duration>
// Specifies the amount of time to retry when using the retry directive.
// Defaults to testutils.DefaultSucceedsSoonDuration (45 seconds).
//
// The overall architecture of TestLogic is as follows:
//
// - TestLogic() selects the input files and instantiates
Expand Down Expand Up @@ -1080,6 +1085,10 @@ type logicTest struct {
// to false after every successful statement or query test point, including
// those which are supposed to error out.
retry bool

// retryDuration is the maximum duration to retry a statement when using
// the retry directive.
retryDuration time.Duration
}

func (t *logicTest) t() *testing.T {
Expand Down Expand Up @@ -1876,6 +1885,7 @@ func (t *logicTest) setup(
tempExternalIODir, tempExternalIODirCleanup := testutils.TempDir(t.rootT)
t.sharedIODir = tempExternalIODir
t.testCleanupFuncs = append(t.testCleanupFuncs, tempExternalIODirCleanup)
t.retryDuration = testutils.DefaultSucceedsSoonDuration

if cfg.UseCockroachGoTestserver {
skip.UnderRace(t.t(), "test uses a different binary, so the race detector doesn't work")
Expand Down Expand Up @@ -2483,6 +2493,21 @@ func (t *logicTest) processSubtest(
)
}
repeat = count

case "retry_duration":
var duration time.Duration
var err error
if len(fields) != 2 {
err = errors.New("invalid line format")
} else {
duration, err = time.ParseDuration(fields[1])
}
if err != nil {
return errors.Wrapf(err, "%s:%d invalid retry_duration line",
path, s.Line+subtest.lineLineIndexIntoFile)
}
t.retryDuration = duration

case "skip_on_retry":
t.skipOnRetry = true

Expand Down Expand Up @@ -2567,12 +2592,12 @@ func (t *logicTest) processSubtest(
var cont bool
var err error
if t.retry {
err = testutils.SucceedsSoonError(func() error {
err = testutils.SucceedsWithinError(func() error {
t.purgeZoneConfig()
var tempErr error
cont, tempErr = t.execStatement(stmt)
return tempErr
})
}, t.retryDuration)
} else {
cont, err = t.execStatement(stmt)
}
Expand Down Expand Up @@ -2903,10 +2928,10 @@ func (t *logicTest) processSubtest(

for i := 0; i < repeat; i++ {
if t.retry && !*rewriteResultsInTestfiles {
if err := testutils.SucceedsSoonError(func() error {
if err := testutils.SucceedsWithinError(func() error {
t.purgeZoneConfig()
return t.execQuery(query)
}); err != nil {
}, t.retryDuration); err != nil {
t.Error(err)
}
} else {
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/logictest/testdata/logic_test/upgrade_skip_version
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ upgrade all

# We have seen that upgrades can take a long time in CI. Give some extra time
# for the upgrade to complete.
sleep 30s
retry_duration 10m

# Verify that the cluster is upgrading to 24.1.
query T retry
Expand Down

0 comments on commit e6a9c92

Please sign in to comment.