From 5f7c9ae31a0addeec6596c6e8e9e8e50c7e54777 Mon Sep 17 00:00:00 2001
From: Azhng
Date: Thu, 2 Dec 2021 11:38:35 -0500
Subject: [PATCH] statusccl: speed up unit test by reducing concurrency

Previously, we spun up 1 host cluster and 2 tenant clusters to test the
tenant status server. Each cluster contained 3 nodes, so we effectively
spun up 9 nodes for the test. This is very expensive and can cause
errors like "race: limit on 8128 simultaneously alive goroutines is
exceeded, dying" due to the underlying LLVM limitation.

This commit reduces the total number of nodes that are spun up. The
host cluster and the second tenant cluster are now reduced to 1 node
each to lower the cost of the tests. A special `randomServer` value is
introduced so the tests no longer rely on hard-coded server indexes.

Release note: None
---
 .../serverccl/statusccl/tenant_status_test.go | 35 ++++++++--------
 .../serverccl/statusccl/tenant_test_utils.go  | 40 ++++++++++++++-----
 2 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/pkg/ccl/serverccl/statusccl/tenant_status_test.go b/pkg/ccl/serverccl/statusccl/tenant_status_test.go
index 41d7971461d5..cc0278799e02 100644
--- a/pkg/ccl/serverccl/statusccl/tenant_status_test.go
+++ b/pkg/ccl/serverccl/statusccl/tenant_status_test.go
@@ -279,21 +279,21 @@ func testResetSQLStatsRPCForTenant(
 	t.Run(fmt.Sprintf("flushed=%t", flushed), func(t *testing.T) {
 		// Clears the SQL Stats at the end of each test via builtin.
 		defer func() {
-			testCluster.tenantConn(0 /* idx */).Exec(t, "SELECT crdb_internal.reset_sql_stats()")
-			controlCluster.tenantConn(0 /* idx */).Exec(t, "SELECT crdb_internal.reset_sql_stats()")
+			testCluster.tenantConn(randomServer).Exec(t, "SELECT crdb_internal.reset_sql_stats()")
+			controlCluster.tenantConn(randomServer).Exec(t, "SELECT crdb_internal.reset_sql_stats()")
 		}()

 		for _, stmt := range stmts {
-			testCluster.tenantConn(0 /* idx */).Exec(t, stmt)
-			controlCluster.tenantConn(0 /* idx */).Exec(t, stmt)
+			testCluster.tenantConn(randomServer).Exec(t, stmt)
+			controlCluster.tenantConn(randomServer).Exec(t, stmt)
 		}

 		if flushed {
-			testCluster.tenantSQLStats(0 /* idx */).Flush(ctx)
-			controlCluster.tenantSQLStats(0 /* idx */).Flush(ctx)
+			testCluster.tenantSQLStats(randomServer).Flush(ctx)
+			controlCluster.tenantSQLStats(randomServer).Flush(ctx)
 		}

-		status := testCluster.tenantStatusSrv(1 /* idx */)
+		status := testCluster.tenantStatusSrv(randomServer)

 		statsPreReset, err := status.Statements(ctx, &serverpb.StatementsRequest{
 			Combined: true,
@@ -334,7 +334,7 @@ func testResetSQLStatsRPCForTenant(

 	// Ensures that sql stats reset is isolated by tenant boundary.
 	statsFromControlCluster, err :=
-		controlCluster.tenantStatusSrv(1 /* idx */).Statements(ctx, &serverpb.StatementsRequest{
+		controlCluster.tenantStatusSrv(randomServer).Statements(ctx, &serverpb.StatementsRequest{
 			Combined: true,
 		})
 	require.NoError(t, err)
@@ -396,11 +396,14 @@ VALUES (1, 10, 100), (2, 20, 200), (3, 30, 300)
 `)

 	// Record scan on primary index.
-	cluster.tenantConn(0).Exec(t, "SELECT * FROM test")
+	cluster.tenantConn(randomServer).
+		Exec(t, "SELECT * FROM test")

 	// Record scan on secondary index.
-	cluster.tenantConn(1).Exec(t, "SELECT * FROM test@test_a_idx")
-	testTableIDStr := cluster.tenantConn(2).QueryStr(t, "SELECT 'test'::regclass::oid")[0][0]
+	cluster.tenantConn(randomServer).
+		Exec(t, "SELECT * FROM test@test_a_idx")
+	testTableIDStr := cluster.tenantConn(randomServer).
+		QueryStr(t, "SELECT 'test'::regclass::oid")[0][0]
 	testTableID, err := strconv.Atoi(testTableIDStr)
 	require.NoError(t, err)

@@ -440,12 +443,12 @@ WHERE
 			{testTableIDStr, "1", "1", "true"},
 			{testTableIDStr, "2", "1", "true"},
 		}
-		cluster.tenantConn(2).CheckQueryResults(t, query, expected)
+		cluster.tenantConn(randomServer).CheckQueryResults(t, query, expected)
 	}

 	// Reset index usage stats.
 	timePreReset := timeutil.Now()
-	status := testingCluster.tenantStatusSrv(1 /* idx */)
+	status := testingCluster.tenantStatusSrv(randomServer)

 	// Reset index usage stats.
 	testCase.resetFn(testHelper)
@@ -457,7 +460,7 @@ WHERE

 	// Ensure tenant data isolation.
 	// Check that last reset time was not updated for control cluster.
-	status = controlCluster.tenantStatusSrv(1 /* idx */)
+	status = controlCluster.tenantStatusSrv(randomServer)
 	resp, err = status.IndexUsageStatistics(ctx, &serverpb.IndexUsageStatisticsRequest{})
 	require.NoError(t, err)
 	require.Equal(t, resp.LastReset, time.Time{})
@@ -486,7 +489,7 @@ WHERE
 	}

 	// Ensure tenant data isolation.
-	rows = controlCluster.tenantConn(2).QueryStr(t, query, controlTableID)
+	rows = controlCluster.tenantConn(0).QueryStr(t, query, controlTableID)
 	require.NotNil(t, rows)
 	for _, row := range rows {
 		require.NotEqual(t, row[1], "0", "expected total reads for table %s to not be reset, but got %s", row[0], row[1])
@@ -659,7 +662,7 @@ WHERE
 	require.Equal(t, expected, actual)

 	// Ensure tenant data isolation.
-	actual = controlledCluster.tenantConn(2).QueryStr(t, query, testTableID)
+	actual = controlledCluster.tenantConn(0).QueryStr(t, query, testTableID)
 	expected = [][]string{}
 	require.Equal(t, expected, actual)

diff --git a/pkg/ccl/serverccl/statusccl/tenant_test_utils.go b/pkg/ccl/serverccl/statusccl/tenant_test_utils.go
index 82a31493e475..871bde2c215f 100644
--- a/pkg/ccl/serverccl/statusccl/tenant_test_utils.go
+++ b/pkg/ccl/serverccl/statusccl/tenant_test_utils.go
@@ -11,6 +11,7 @@ package statusccl
 import (
 	"context"
 	gosql "database/sql"
+	"math/rand"
 	"net/http"
 	"testing"

@@ -29,6 +30,13 @@ import (
 	"github.com/stretchr/testify/require"
 )

+// serverIdx is the index of the node within a test cluster. A special value
+// `randomServer` can be used to let the test helper randomly choose a
+// server from the test cluster.
+type serverIdx int
+
+const randomServer serverIdx = -1
+
 type testTenant struct {
 	tenant         serverutils.TestTenantInterface
 	tenantConn     *gosql.DB
@@ -85,7 +93,7 @@ func newTestTenantHelper(
 	t.Helper()

 	params, _ := tests.CreateTestServerParams()
-	testCluster := serverutils.StartNewTestCluster(t, 3 /* numNodes */, base.TestClusterArgs{
+	testCluster := serverutils.StartNewTestCluster(t, 1 /* numNodes */, base.TestClusterArgs{
 		ServerArgs: params,
 	})
 	server := testCluster.Server(0)
@@ -99,10 +107,12 @@ func newTestTenantHelper(
 			security.EmbeddedTenantIDs()[0],
 			knobs,
 		),
+		// Spin up a small tenant cluster under a different tenant ID to test
+		// tenant isolation.
 		tenantControlCluster: newTenantCluster(
 			t,
 			server,
-			tenantClusterSize,
+			1, /* tenantClusterSize */
 			security.EmbeddedTenantIDs()[1],
 			knobs,
 		),
@@ -146,22 +156,22 @@ func newTenantCluster(
 	return cluster
 }

-func (c tenantCluster) tenantConn(idx int) *sqlutils.SQLRunner {
-	return c[idx].tenantDB
+func (c tenantCluster) tenantConn(idx serverIdx) *sqlutils.SQLRunner {
+	return c.tenant(idx).tenantDB
 }

-func (c tenantCluster) tenantHTTPClient(t *testing.T, idx int) *httpClient {
-	client, err := c[idx].tenant.RPCContext().GetHTTPClient()
+func (c tenantCluster) tenantHTTPClient(t *testing.T, idx serverIdx) *httpClient {
+	client, err := c.tenant(idx).tenant.RPCContext().GetHTTPClient()
 	require.NoError(t, err)
 	return &httpClient{t: t, client: client, baseURL: "https://" + c[idx].tenant.HTTPAddr()}
 }

-func (c tenantCluster) tenantSQLStats(idx int) *persistedsqlstats.PersistedSQLStats {
-	return c[idx].tenantSQLStats
+func (c tenantCluster) tenantSQLStats(idx serverIdx) *persistedsqlstats.PersistedSQLStats {
+	return c.tenant(idx).tenantSQLStats
 }

-func (c tenantCluster) tenantStatusSrv(idx int) serverpb.SQLStatusServer {
-	return c[idx].tenantStatus
+func (c tenantCluster) tenantStatusSrv(idx serverIdx) serverpb.SQLStatusServer {
+	return c.tenant(idx).tenantStatus
 }

 func (c tenantCluster) cleanup(t *testing.T) {
@@ -170,6 +180,16 @@ func (c tenantCluster) cleanup(t *testing.T) {
 	}
 }

+// tenant selects a tenant node from the tenant cluster. If randomServer
+// is passed in, then a random node is selected.
+func (c tenantCluster) tenant(idx serverIdx) *testTenant {
+	if idx == randomServer {
+		return c[rand.Intn(len(c))]
+	}
+
+	return c[idx]
+}
+
 type httpClient struct {
 	t      *testing.T
 	client http.Client
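
For readers who want the sentinel-index pattern outside the CockroachDB test harness, below is a minimal, self-contained Go sketch of the idea this patch introduces: a typed index whose negative `randomServer` value the accessor resolves to a random element, so call sites never hard-code a node index. The `fakeTenant` and `fakeCluster` names are illustrative stand-ins for the patch's `testTenant` and `tenantCluster` types, not part of the actual change.

```go
package main

import (
	"fmt"
	"math/rand"
)

// serverIdx mirrors the patch: a typed index into a test cluster, where a
// negative sentinel means "pick any server".
type serverIdx int

const randomServer serverIdx = -1

// fakeTenant and fakeCluster exist only to make this sketch runnable; the
// patch uses testTenant and tenantCluster instead.
type fakeTenant struct{ name string }

type fakeCluster []*fakeTenant

// tenant resolves a serverIdx to a concrete node, choosing one at random
// when the randomServer sentinel is passed in.
func (c fakeCluster) tenant(idx serverIdx) *fakeTenant {
	if idx == randomServer {
		return c[rand.Intn(len(c))]
	}
	return c[idx]
}

func main() {
	cluster := fakeCluster{{name: "tenant-0"}, {name: "tenant-1"}, {name: "tenant-2"}}

	// A hard-coded index always returns the same node.
	fmt.Println(cluster.tenant(0).name)

	// The sentinel keeps callers agnostic about cluster size, which is what
	// lets the patch shrink clusters without rewriting every call site.
	fmt.Println(cluster.tenant(randomServer).name)
}
```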