From dfe73f6eb199767777fce52439a0db642d8746ab Mon Sep 17 00:00:00 2001 From: irfan sharif Date: Mon, 17 Jan 2022 21:54:51 -0500 Subject: [PATCH 1/3] backupccl: de-flake TestPaginatedBackupTenant This test readily fails under stress with span configs enabled (#73876). Now that we split within the tenant keyspace for tenant tables, the export requests generated for each BACKUP is liable to be retried if concurrent with range splits. The splits causes KV to error out with RangeKeyMismatchErrors, at which point the request is internally retried. The test, which wants to assert on what ExportSpan requests are intercepted, previously did not need to consider these retries/need to de-dup -- it now does. Release note: None --- pkg/ccl/backupccl/backup_test.go | 51 ++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/pkg/ccl/backupccl/backup_test.go b/pkg/ccl/backupccl/backup_test.go index 3b63be137c98..047b6dbc312b 100644 --- a/pkg/ccl/backupccl/backup_test.go +++ b/pkg/ccl/backupccl/backup_test.go @@ -767,7 +767,7 @@ func TestBackupRestoreAppend(t *testing.T) { sqlDB.ExpectErr(t, "cannot append a backup of specific", "BACKUP system.users TO ($1, $2, "+ "$3)", test.backups...) - //TODO(dt): prevent backing up different targets to same collection? + // TODO(dt): prevent backing up different targets to same collection? sqlDB.Exec(t, "DROP DATABASE data CASCADE") sqlDB.Exec(t, "RESTORE DATABASE data FROM ($1, $2, $3)", test.backups...) @@ -6793,7 +6793,14 @@ func TestPaginatedBackupTenant(t *testing.T) { serverArgs := base.TestServerArgs{Knobs: base.TestingKnobs{JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals()}} params := base.TestClusterArgs{ServerArgs: serverArgs} var numExportRequests int - exportRequestSpans := make([]string, 0) + + mu := struct { + syncutil.Mutex + exportRequestSpansSet map[string]struct{} + exportRequestSpans []string + }{} + mu.exportRequestSpansSet = make(map[string]struct{}) + mu.exportRequestSpans = make([]string, 0) requestSpanStr := func(span roachpb.Span, timestamp hlc.Timestamp) string { spanStr := "" @@ -6816,10 +6823,15 @@ func TestPaginatedBackupTenant(t *testing.T) { for _, ru := range request.Requests { if exportRequest, ok := ru.GetInner().(*roachpb.ExportRequest); ok && !isLeasingExportRequest(exportRequest) { - exportRequestSpans = append( - exportRequestSpans, - requestSpanStr(roachpb.Span{Key: exportRequest.Key, EndKey: exportRequest.EndKey}, exportRequest.ResumeKeyTS), - ) + + mu.Lock() + defer mu.Unlock() + req := requestSpanStr(roachpb.Span{Key: exportRequest.Key, EndKey: exportRequest.EndKey}, exportRequest.ResumeKeyTS) + if _, found := mu.exportRequestSpansSet[req]; found { + return nil // nothing to do + } + mu.exportRequestSpansSet[req] = struct{}{} + mu.exportRequestSpans = append(mu.exportRequestSpans, req) numExportRequests++ } } @@ -6846,8 +6858,12 @@ func TestPaginatedBackupTenant(t *testing.T) { _ = security.EmbeddedTenantIDs() resetStateVars := func() { + mu.Lock() + defer mu.Unlock() + numExportRequests = 0 - exportRequestSpans = exportRequestSpans[:0] + mu.exportRequestSpansSet = make(map[string]struct{}) + mu.exportRequestSpans = mu.exportRequestSpans[:0] } _, conn10 := serverutils.StartTenant(t, srv, @@ -6863,26 +6879,27 @@ func TestPaginatedBackupTenant(t *testing.T) { systemDB.Exec(t, `SET CLUSTER SETTING kv.bulk_sst.max_allowed_overage='0b'`) tenant10.Exec(t, `BACKUP DATABASE foo TO 'userfile://defaultdb.myfililes/test'`) - require.Equal(t, 1, numExportRequests) startingSpan := roachpb.Span{Key: []byte("/Tenant/10/Table/56/1"), EndKey: []byte("/Tenant/10/Table/56/2")} - require.Equal(t, exportRequestSpans, []string{startingSpan.String()}) + mu.Lock() + require.Equal(t, []string{startingSpan.String()}, mu.exportRequestSpans) + mu.Unlock() resetStateVars() // Two ExportRequests with one resume span. systemDB.Exec(t, `SET CLUSTER SETTING kv.bulk_sst.target_size='50b'`) tenant10.Exec(t, `BACKUP DATABASE foo TO 'userfile://defaultdb.myfililes/test2'`) - require.Equal(t, 2, numExportRequests) startingSpan = roachpb.Span{Key: []byte("/Tenant/10/Table/56/1"), EndKey: []byte("/Tenant/10/Table/56/2")} resumeSpan := roachpb.Span{Key: []byte("/Tenant/10/Table/56/1/510/0"), EndKey: []byte("/Tenant/10/Table/56/2")} - require.Equal(t, exportRequestSpans, []string{startingSpan.String(), resumeSpan.String()}) + mu.Lock() + require.Equal(t, []string{startingSpan.String(), resumeSpan.String()}, mu.exportRequestSpans) + mu.Unlock() resetStateVars() // One ExportRequest for every KV. systemDB.Exec(t, `SET CLUSTER SETTING kv.bulk_sst.target_size='10b'`) tenant10.Exec(t, `BACKUP DATABASE foo TO 'userfile://defaultdb.myfililes/test3'`) - require.Equal(t, 5, numExportRequests) var expected []string for _, resume := range []exportResumePoint{ {[]byte("/Tenant/10/Table/56/1"), []byte("/Tenant/10/Table/56/2"), withoutTS}, @@ -6893,7 +6910,9 @@ func TestPaginatedBackupTenant(t *testing.T) { } { expected = append(expected, requestSpanStr(roachpb.Span{Key: resume.key, EndKey: resume.endKey}, resume.timestamp)) } - require.Equal(t, expected, exportRequestSpans) + mu.Lock() + require.Equal(t, expected, mu.exportRequestSpans) + mu.Unlock() resetStateVars() tenant10.Exec(t, `CREATE DATABASE baz; CREATE TABLE baz.bar(i int primary key, v string); INSERT INTO baz.bar VALUES (110, 'a'), (210, 'b'), (310, 'c'), (410, 'd'), (510, 'e')`) @@ -6920,7 +6939,9 @@ func TestPaginatedBackupTenant(t *testing.T) { } { expected = append(expected, requestSpanStr(roachpb.Span{Key: resume.key, EndKey: resume.endKey}, resume.timestamp)) } - require.Equal(t, expected, exportRequestSpans) + mu.Lock() + require.Equal(t, expected, mu.exportRequestSpans) + mu.Unlock() resetStateVars() // TODO(adityamaru): Add a RESTORE inside tenant once it is supported. @@ -8885,7 +8906,7 @@ DROP TABLE foo; } // TestRestoreNewDatabaseName tests the new_db_name optional feature for single database -//restores, which allows the user to rename the database they intend to restore. +// restores, which allows the user to rename the database they intend to restore. func TestRestoreNewDatabaseName(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) From 6c30e386032616bb7f20c5b6c6004c4ec82698ec Mon Sep 17 00:00:00 2001 From: irfan sharif Date: Mon, 17 Jan 2022 23:09:40 -0500 Subject: [PATCH 2/3] server: de-flake TestStatusAPICombinedTransactions This test got a lot slower under the span configs infrastructure, something we're investigating as part of #74919. Until then, prevent this test from spoiling builds. Release note: None --- pkg/server/status_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/server/status_test.go b/pkg/server/status_test.go index f3ba499aace6..a59ea2bf9edd 100644 --- a/pkg/server/status_test.go +++ b/pkg/server/status_test.go @@ -1478,6 +1478,7 @@ func TestStatusAPICombinedTransactions(t *testing.T) { defer log.Scope(t).Close(t) params, _ := tests.CreateTestServerParams() + params.Knobs.SpanConfig = &spanconfig.TestingKnobs{ManagerDisableJobCreation: true} // TODO(irfansharif): #74919. testCluster := serverutils.StartNewTestCluster(t, 3, base.TestClusterArgs{ ServerArgs: params, }) From 32183532321e37d7a03f135c83a9b26c28454f21 Mon Sep 17 00:00:00 2001 From: irfan sharif Date: Mon, 17 Jan 2022 23:16:51 -0500 Subject: [PATCH 3/3] kvserver: squash data race Fixes #74932; we were reading from the replicas map without having acquired the prerequisite lock. Release note: None --- pkg/kv/kvserver/store.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index 893094aaa550..190ea155c6f4 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -2336,6 +2336,9 @@ func (s *Store) onSpanConfigUpdate(ctx context.Context, updated roachpb.Span) { } now := s.cfg.Clock.NowAsClockTimestamp() + + s.mu.RLock() + defer s.mu.RUnlock() if err := s.mu.replicasByKey.VisitKeyRange(ctx, sp.Key, sp.EndKey, AscendingKeyOrder, func(ctx context.Context, it replicaOrPlaceholder) error { repl := it.repl