Skip to content

Commit

Permalink
test: fix tenant duplication utility generation numbers (#8096)
Browse files Browse the repository at this point in the history
## Problem
We have this set of test utilities which duplicate a tenant by copying
everything that's in remote storage and then attaching a tenant to the
pageserver and storage controller. When the "copied tenants" are created
on the storage controller, they start off from generation number 0. This
means that they can't see anything past that generation.

This issues has existed ever since generation numbers have been
introduced, but we've largely been lucky
for the generation to stay stable during the template tenant creation.

## Summary of Changes
Extend the storage controller debug attach hook to accept a generation
override. Use that in the tenant duplication logic to set the generation
number to something greater than the naturally reached generation. This
allows the tenants to see all layer files.
  • Loading branch information
VladLazar authored Jun 19, 2024
1 parent 5778d71 commit e7d62a2
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 3 deletions.
2 changes: 2 additions & 0 deletions control_plane/src/storage_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: Option<NodeId>,
pub generation_override: Option<i32>,
}

#[derive(Serialize, Deserialize)]
Expand Down Expand Up @@ -440,6 +441,7 @@ impl StorageController {
let request = AttachHookRequest {
tenant_shard_id,
node_id: Some(pageserver_id),
generation_override: None,
};

let response = self
Expand Down
3 changes: 2 additions & 1 deletion storage_controller/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1234,13 +1234,14 @@ impl Service {
let locked = self.inner.write().unwrap();
!locked.tenants.contains_key(&attach_req.tenant_shard_id)
};

if insert {
let tsp = TenantShardPersistence {
tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: 0,
generation: Some(0),
generation: attach_req.generation_override.or(Some(0)),
generation_pageserver: None,
placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
Expand Down
16 changes: 14 additions & 2 deletions test_runner/fixtures/neon_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -2159,12 +2159,19 @@ def storage_controller_ready():
return time.time() - t1

def attach_hook_issue(
self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
self,
tenant_shard_id: Union[TenantId, TenantShardId],
pageserver_id: int,
generation_override: Optional[int] = None,
) -> int:
body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}
if generation_override is not None:
body["generation_override"] = generation_override

response = self.request(
"POST",
f"{self.env.storage_controller_api}/debug/v1/attach-hook",
json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
json=body,
headers=self.headers(TokenScope.ADMIN),
)
gen = response.json()["gen"]
Expand Down Expand Up @@ -2635,6 +2642,7 @@ def tenant_attach(
config: None | Dict[str, Any] = None,
config_null: bool = False,
generation: Optional[int] = None,
override_storage_controller_generation: bool = False,
):
"""
Tenant attachment passes through here to acquire a generation number before proceeding
Expand All @@ -2643,6 +2651,10 @@ def tenant_attach(
client = self.http_client()
if generation is None:
generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
elif override_storage_controller_generation:
generation = self.env.storage_controller.attach_hook_issue(
tenant_id, self.id, generation
)
return client.tenant_attach(
tenant_id,
config,
Expand Down
2 changes: 2 additions & 0 deletions test_runner/fixtures/pageserver/many_tenants.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ def attach_broken(tenant):
env.pageserver.tenant_attach(
tenant,
config=template_config.copy(),
generation=100,
override_storage_controller_generation=True,
)
time.sleep(0.1)
wait_until_tenant_state(ps_http, tenant, "Broken", 10)
Expand Down

1 comment on commit e7d62a2

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

3316 tests run: 3187 passed, 3 failed, 126 skipped (full report)


Failures on Postgres 15

  • test_storage_controller_heartbeats[failure2]: debug

Failures on Postgres 14

  • test_basebackup_with_high_slru_count[github-actions-selfhosted-vectored-10-13-30]: release
  • test_pageserver_max_throughput_getpage_at_latest_lsn[github-actions-selfhosted-10-6-30]: release
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_basebackup_with_high_slru_count[release-pg14-github-actions-selfhosted-vectored-10-13-30] or test_pageserver_max_throughput_getpage_at_latest_lsn[release-pg14-github-actions-selfhosted-10-6-30] or test_storage_controller_heartbeats[debug-pg15-failure2]"

Test coverage report is not available

The comment gets automatically updated with the latest test results
e7d62a2 at 2024-06-19T12:55:52.261Z :recycle:

Please sign in to comment.