Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wait for tenant to be active before polling for timeline absence #4856

Merged
merged 2 commits into from
Aug 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions test_runner/fixtures/pageserver/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,9 @@ def wait_timeline_detail_404(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
timeline_id: TimelineId,
wait_longer: bool = False,
iterations: int,
):
last_exc = None
iterations = 10 if wait_longer else 2
for _ in range(iterations):
time.sleep(0.250)
try:
Expand All @@ -220,8 +219,8 @@ def timeline_delete_wait_completed(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
timeline_id: TimelineId,
wait_longer: bool = False, # Use when running with RemoteStorageKind.REAL_S3
iterations: int = 20,
**delete_args,
):
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer)
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
20 changes: 10 additions & 10 deletions test_runner/regress/test_timeline_delete.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,8 @@ def test_delete_timeline_exercise_crash_safety_failpoints(

ps_http.configure_failpoints((failpoint, "return"))

iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4
LizardWizzard marked this conversation as resolved.
Show resolved Hide resolved

# These failpoints are hit before the background task is spawned,
# so they result in an API request failure.
if failpoint in (
Expand All @@ -245,7 +247,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
tenant_id=env.initial_tenant,
timeline_id=timeline_id,
expected_state="Broken",
iterations=2, # effectively try immediately and retry once in one second
iterations=iterations,
)

reason = timeline_info["state"]["Broken"]["reason"]
Expand All @@ -254,29 +256,27 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
# failpoint may not be the only error in the stack
assert reason.endswith(f"failpoint: {failpoint}"), reason

wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3
if check is Check.RETRY_WITH_RESTART:
env.pageserver.stop()
env.pageserver.start()
if failpoint == "timeline-delete-before-index-deleted-at":
# We crashed before persisting this to remote storage, need to retry delete request

# Wait till tenant is loaded. Shouldn't take longer than 2 seconds (we shouldn't block tenant loading)
wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2)
wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations)

if failpoint == "timeline-delete-before-index-deleted-at":
# We crashed before persisting this to remote storage, need to retry delete request
timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
else:
# Pageserver should've resumed deletion after restart.
wait_timeline_detail_404(
ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
ps_http, env.initial_tenant, timeline_id, iterations=iterations
)
elif check is Check.RETRY_WITHOUT_RESTART:
# this should succeed
# this also checks that delete can be retried even when timeline is in Broken state
ps_http.configure_failpoints((failpoint, "off"))

timeline_delete_wait_completed(
ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
ps_http, env.initial_tenant, timeline_id, iterations=iterations
)

# Check remote is empty
Expand Down Expand Up @@ -569,7 +569,7 @@ def first_call(result_queue):
try:
log.info("first call start")
timeline_delete_wait_completed(
ps_http, env.initial_tenant, child_timeline_id, timeout=10
ps_http, env.initial_tenant, child_timeline_id, timeout=20
)
log.info("first call success")
result_queue.put("success")
Expand Down Expand Up @@ -683,7 +683,7 @@ def first_request_finished():
wait_until(50, 0.1, first_request_finished)

# check that the timeline is gone
wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id)
wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=2)


@pytest.mark.parametrize(
Expand Down