regionliveness: fix flakes inside TestRegionLivenessProber
Previously, queries cancelled by the timeouts that region liveness
probing sets could fail with "context deadline exceeded" instead of a
query-canceled error. This could happen in code paths where the
cancellation checker isn't used, for example within the region liveness
code when pure KV calls are made. To address this, this patch allows
context-cancelled errors to be treated as timeouts. Additionally, the
region liveness tests are cleaned up and have their sqlliveness TTL
increased to reduce the risk of flakes. The reduced probing timeouts are
also now set up only for the short windows in which failures are
expected, further reducing the risk of intermittent failures.

Fixes: cockroachdb#118465, cockroachdb#118464

Release note: None
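
As a self-contained illustration of the failure mode described above, here is a minimal sketch using only the standard library (the `isProbeTimeout` helper is hypothetical and not part of the CockroachDB code): a probe whose context is cut short by the probing timeout can surface a raw context error rather than a pgcode-tagged "query canceled" error, so the raw context errors have to be accepted as timeouts too.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// isProbeTimeout is a hypothetical stand-in for the classification the patch
// performs: in addition to SQL-level query-canceled errors, accept the raw
// context errors that pure KV calls can return, since those code paths never
// run the SQL cancellation checker.
func isProbeTimeout(err error) bool {
	return errors.Is(err, context.Canceled) ||
		errors.Is(err, context.DeadlineExceeded)
}

func main() {
	// Simulate a probe whose context expires before the query finishes.
	ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond)
	defer cancel()
	<-ctx.Done()
	fmt.Println(isProbeTimeout(ctx.Err())) // true: classified as a probe timeout
}
```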
fqazi committed Jan 30, 2024
1 parent 1d2534b commit 3cf4bac
Showing 2 changed files with 32 additions and 19 deletions.
42 changes: 29 additions & 13 deletions pkg/ccl/multiregionccl/regionliveness_test.go
@@ -44,7 +44,12 @@ import (
 )
 
 const (
+	// Use a low probe timeout number, but intentionally only manipulate to
+	// this value when *failures* are expected.
 	testingRegionLivenessProbeTimeout = time.Second * 2
+	// Use a reduced TTL, but not too far to avoid having the nodes die due to liveness
+	// issues on overloaded systems.
+	defaultLivenessTTL = time.Second * 20
 )
 
 func TestRegionLivenessProber(t *testing.T) {
@@ -53,7 +58,7 @@ func TestRegionLivenessProber(t *testing.T) {
 	// This test forces the SQL liveness TTL be a small number,
 	// which makes the heartbeats even more critical. Under stress and
 	// race environments this test becomes even more sensitive, if
-	// we can't send heartbeats within 10 seconds.
+	// we can't send heartbeats within 20 seconds.
 	skip.UnderStress(t)
 	skip.UnderRace(t)
 	skip.UnderDeadlock(t)
@@ -65,7 +70,7 @@ func TestRegionLivenessProber(t *testing.T) {
 	makeSettings := func() *cluster.Settings {
 		cs := cluster.MakeTestingClusterSettings()
 		instancestorage.ReclaimLoopInterval.Override(ctx, &cs.SV, 150*time.Millisecond)
-		slbase.DefaultTTL.Override(ctx, &cs.SV, 10*time.Second)
+		slbase.DefaultTTL.Override(ctx, &cs.SV, defaultLivenessTTL)
 		regionliveness.RegionLivenessEnabled.Override(ctx, &cs.SV, true)
 		return cs
 	}
@@ -88,11 +93,11 @@ func TestRegionLivenessProber(t *testing.T) {
 	var tenants []serverutils.ApplicationLayerInterface
 	var tenantSQL []*gosql.DB
 	blockProbeQuery := atomic.Bool{}
-	defer regionliveness.TestingSetProbeLivenessTimeout(500*time.Millisecond,
+	defer regionliveness.TestingSetProbeLivenessTimeout(
 		func() {
 			// Timeout attempts to probe intentionally.
 			if blockProbeQuery.Swap(false) {
-				time.Sleep(2 * time.Second)
+				time.Sleep(testingRegionLivenessProbeTimeout)
 			}
 		})()
 
@@ -148,6 +153,11 @@ func TestRegionLivenessProber(t *testing.T) {
 	liveRegions, err = regionProber.QueryLiveness(ctx, testTxn)
 	require.NoError(t, err)
 	checkExpectedRegions(expectedRegions, liveRegions)
+	// Override the table timeout probe for testing to ensure timeout failures
+	// happen now.
+	for _, ts := range tenants {
+		regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, testingRegionLivenessProbeTimeout)
+	}
 	// Probe the liveness of the region, but timeout the query
 	// intentionally to make it seem dead.
 	blockProbeQuery.Store(true)
@@ -213,7 +223,7 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
 	// This test forces the SQL liveness TTL be a small number,
 	// which makes the heartbeats even more critical. Under stress and
 	// race environments this test becomes even more sensitive, if
-	// we can't send heartbeats within 10 seconds.
+	// we can't send heartbeats within 20 seconds.
 	skip.UnderStress(t)
 	skip.UnderRace(t)
 	skip.UnderDeadlock(t)
@@ -225,7 +235,7 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
 	makeSettings := func() *cluster.Settings {
 		cs := cluster.MakeTestingClusterSettings()
 		instancestorage.ReclaimLoopInterval.Override(ctx, &cs.SV, 150*time.Millisecond)
-		slbase.DefaultTTL.Override(ctx, &cs.SV, 10*time.Second)
+		slbase.DefaultTTL.Override(ctx, &cs.SV, defaultLivenessTTL)
 		regionliveness.RegionLivenessEnabled.Override(ctx, &cs.SV, true)
 		return cs
 	}
@@ -239,11 +249,11 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
 	targetCount := atomic.Int64{}
 	var tenants []serverutils.ApplicationLayerInterface
 	var tenantSQL []*gosql.DB
-	defer regionliveness.TestingSetProbeLivenessTimeout(1*time.Second, func() {
+	defer regionliveness.TestingSetProbeLivenessTimeout(func() {
 		if !detectLeaseWait.Load() {
 			return
 		}
-		time.Sleep(time.Second * 2)
+		time.Sleep(testingRegionLivenessProbeTimeout)
 		targetCount.Swap(0)
 		detectLeaseWait.Swap(false)
 	})()
@@ -274,7 +284,7 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
 				if targetCount.Add(1) != 1 {
 					return
 				}
-				time.Sleep(time.Second * 2)
+				time.Sleep(testingRegionLivenessProbeTimeout)
 			}
 		},
 	},
@@ -298,10 +308,7 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
 			require.NoError(t, err)
 		}
 	}
-	// Override the table timeout probe for testing.
-	for _, ts := range tenants {
-		regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, testingRegionLivenessProbeTimeout)
-	}
+
 	// Create a new table and have it used on all nodes.
 	_, err = tenantSQL[0].Exec("CREATE TABLE t1(j int)")
 	require.NoError(t, err)
@@ -312,6 +319,11 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
 	row := tenantSQL[0].QueryRow("SELECT 't1'::REGCLASS::OID")
 	var tableID int
 	require.NoError(t, row.Scan(&tableID))
+	// Override the table timeout probe for testing to ensure timeout failures
+	// happen now.
+	for _, ts := range tenants {
+		regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, testingRegionLivenessProbeTimeout)
+	}
 	// Issue a schema change which should mark this region as dead, and fail
 	// out because our probe query will time out.
 	detectLeaseWait.Swap(true)
@@ -350,5 +362,9 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
 		})
 		return builder.BuildExistingMutableType()
 	})
+	// Restore the override to reduce the risk of failing on overloaded systems.
+	for _, ts := range tenants {
+		regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, regionliveness.RegionLivenessProbeTimeout.Default())
+	}
 	require.NoError(t, lm.WaitForNoVersion(ctx, descpb.ID(tableID), cachedDatabaseRegions, retry.Options{}))
 }
9 changes: 3 additions & 6 deletions pkg/sql/regionliveness/prober.go
@@ -122,15 +122,11 @@ type livenessProber struct {
 	settings *clustersettings.Settings
 }
 
-var probeLivenessTimeout = 15 * time.Second
 var testingProbeQueryCallbackFunc func()
 
-func TestingSetProbeLivenessTimeout(newTimeout time.Duration, probeCallbackFn func()) func() {
-	oldTimeout := probeLivenessTimeout
-	probeLivenessTimeout = newTimeout
+func TestingSetProbeLivenessTimeout(probeCallbackFn func()) func() {
 	testingProbeQueryCallbackFunc = probeCallbackFn
 	return func() {
-		probeLivenessTimeout = oldTimeout
 		probeCallbackFn = nil
 	}
 }
@@ -316,7 +312,8 @@ func (l *livenessProber) GetProbeTimeout() (bool, time.Duration) {
 // when checking for region liveness.
 func IsQueryTimeoutErr(err error) bool {
 	return pgerror.GetPGCode(err) == pgcode.QueryCanceled ||
-		errors.HasType(err, (*timeutil.TimeoutError)(nil))
+		errors.HasType(err, (*timeutil.TimeoutError)(nil)) ||
+		errors.Is(err, context.Canceled)
 }
 
 // IsMissingRegionEnumErr determines if a query hit an error because of a missing
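For reference, here is a hedged, test-style sketch of the behavior the prober.go hunk above enables. It is not part of the commit; it assumes the import path github.com/cockroachdb/cockroach/pkg/sql/regionliveness and uses only the IsQueryTimeoutErr signature visible in the diff.

```go
package regionliveness_test

import (
	"context"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/sql/regionliveness"
)

// After this change, a bare context.Canceled error, as returned by pure KV
// calls whose context was cancelled by the probe timeout, is classified as a
// query timeout instead of surfacing as an unexpected failure.
func TestContextCanceledTreatedAsTimeout(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	cancel()
	if !regionliveness.IsQueryTimeoutErr(ctx.Err()) {
		t.Fatal("expected context.Canceled to count as a probe timeout")
	}
}
```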
