Skip to content

Commit

Permalink
tests: use semaphore instead of lock for Endpoint.running (#8112)
Browse files Browse the repository at this point in the history
## Problem

Ahem, let's try this again.

#8110 had a spooky failure in
test_multi_attach where a call to Endpoint.stop() timed out waiting for
a lock, even though we can see an earlier call completing and releasing
the lock. I suspect something weird is going on with the way pytest runs
tests across processes, or use of asyncio perhaps.

Anyway: the simplest fix is to just use a semaphore instead: if we don't
lock we can't deadlock.

## Summary of changes

- Make Endpoint.running a semaphore, where we add a unit to its counter
when starting the process and atomically decrement it when stopping.
  • Loading branch information
jcsp authored Jun 19, 2024
1 parent fd0b22f commit f0e2bb7
Showing 1 changed file with 26 additions and 24 deletions.
50 changes: 26 additions & 24 deletions test_runner/fixtures/neon_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -3446,11 +3446,12 @@ def __init__(
self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
# path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf

# This lock prevents concurrent start & stop operations, keeping `self.running` consistent
# with whether we're really running. Tests generally wouldn't try and do these concurrently,
# but endpoints are also stopped during test teardown, which might happen concurrently with
# destruction of objects in tests.
self.lock = threading.Lock()
# Semaphore is set to 1 when we start, and acquire'd back to zero when we stop
#
# We use a semaphore rather than a bool so that racing calls to stop() don't
# try and stop the same process twice, as stop() is called by test teardown and
# potentially by some __del__ chains in other threads.
self._running = threading.Semaphore(0)

def http_client(
self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
Expand Down Expand Up @@ -3522,15 +3523,14 @@ def start(

log.info(f"Starting postgres endpoint {self.endpoint_id}")

with self.lock:
self.env.neon_cli.endpoint_start(
self.endpoint_id,
safekeepers=self.active_safekeepers,
remote_ext_config=remote_ext_config,
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
)
self.running = True
self.env.neon_cli.endpoint_start(
self.endpoint_id,
safekeepers=self.active_safekeepers,
remote_ext_config=remote_ext_config,
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
)
self._running.release(1)

return self

Expand Down Expand Up @@ -3578,9 +3578,12 @@ def edit_hba(self, hba: List[str]):
conf_file.write("\n".join(hba) + "\n")
conf_file.write(data)

if self.running:
if self.is_running():
self.safe_psql("SELECT pg_reload_conf()")

def is_running(self):
return self._running._value > 0

def reconfigure(self, pageserver_id: Optional[int] = None):
assert self.endpoint_id is not None
self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id)
Expand Down Expand Up @@ -3629,13 +3632,12 @@ def stop(self, mode: str = "fast") -> "Endpoint":
Returns self.
"""

with self.lock:
if self.running:
assert self.endpoint_id is not None
self.env.neon_cli.endpoint_stop(
self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
)
self.running = False
running = self._running.acquire(blocking=False)
if running:
assert self.endpoint_id is not None
self.env.neon_cli.endpoint_stop(
self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
)

return self

Expand All @@ -3645,13 +3647,13 @@ def stop_and_destroy(self, mode: str = "immediate") -> "Endpoint":
Returns self.
"""

with self.lock:
running = self._running.acquire(blocking=False)
if running:
assert self.endpoint_id is not None
self.env.neon_cli.endpoint_stop(
self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
)
self.endpoint_id = None
self.running = False

return self

Expand Down

1 comment on commit f0e2bb7

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

3316 tests run: 3181 passed, 9 failed, 126 skipped (full report)


Failures on Postgres 15

  • test_config_with_unknown_keys_is_bad_request: debug
  • test_no_config[application/json]: debug
  • test_no_config[None]: debug
  • test_null_config: debug
  • test_create_multiple_timelines_parallel: debug

Failures on Postgres 14

  • test_sharding_autosplit[github-actions-selfhosted]: release
  • test_basebackup_with_high_slru_count[github-actions-selfhosted-sequential-10-13-30]: release
  • test_basebackup_with_high_slru_count[github-actions-selfhosted-vectored-10-13-30]: release
  • test_pageserver_max_throughput_getpage_at_latest_lsn[github-actions-selfhosted-10-6-30]: release
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_sharding_autosplit[release-pg14-github-actions-selfhosted] or test_basebackup_with_high_slru_count[release-pg14-github-actions-selfhosted-sequential-10-13-30] or test_basebackup_with_high_slru_count[release-pg14-github-actions-selfhosted-vectored-10-13-30] or test_pageserver_max_throughput_getpage_at_latest_lsn[release-pg14-github-actions-selfhosted-10-6-30] or test_config_with_unknown_keys_is_bad_request[debug-pg15] or test_no_config[debug-pg15-application/json] or test_no_config[debug-pg15-None] or test_null_config[debug-pg15] or test_create_multiple_timelines_parallel[debug-pg15]"

Test coverage report is not available

The comment gets automatically updated with the latest test results
f0e2bb7 at 2024-06-19T17:29:37.535Z :recycle:

Please sign in to comment.