-
-
Notifications
You must be signed in to change notification settings - Fork 720
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
remove server close background task grace period #6633
Changes from 5 commits
1fcc80d
d428181
6bacdbf
b9767ca
21ddc9b
2a073d5
008d66a
4a05368
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -237,36 +237,26 @@ def close(self) -> None: | |
""" | ||
self.closed = True | ||
|
||
async def stop(self, timeout: float = 1) -> None: | ||
async def stop(self) -> None: | ||
"""Close the group and stop all currently running tasks. | ||
|
||
Closes the task group and waits `timeout` seconds for all tasks to gracefully finish. | ||
After the timeout, all remaining tasks are cancelled. | ||
Closes the task group and cancels all tasks. All tasks are cancelled | ||
an additional time for each time this task is cancelled. | ||
""" | ||
self.close() | ||
|
||
current_task = asyncio.current_task(self._get_loop()) | ||
tasks_to_stop = [t for t in self._ongoing_tasks if t is not current_task] | ||
|
||
if tasks_to_stop: | ||
# Wrap gather in task to avoid Python3.8 issue, | ||
# see https://github.com/dask/distributed/pull/6478#discussion_r885696827 | ||
async def gather(): | ||
return await asyncio.gather(*tasks_to_stop, return_exceptions=True) | ||
|
||
err = None | ||
while tasks_to_stop := (self._ongoing_tasks - {current_task}): | ||
for task in tasks_to_stop: | ||
task.cancel() | ||
try: | ||
await asyncio.wait_for( | ||
gather(), | ||
timeout, | ||
) | ||
except asyncio.TimeoutError: | ||
# The timeout on gather has cancelled the tasks, so this will not hang indefinitely | ||
await asyncio.gather(*tasks_to_stop, return_exceptions=True) | ||
await asyncio.wait(tasks_to_stop) | ||
except asyncio.CancelledError as e: | ||
err = e | ||
|
||
if [t for t in self._ongoing_tasks if t is not current_task]: | ||
raise RuntimeError( | ||
f"Expected all ongoing tasks to be cancelled and removed, found {self._ongoing_tasks}." | ||
) | ||
if err is not None: | ||
raise err | ||
|
||
def __len__(self): | ||
return len(self._ongoing_tasks) | ||
|
@@ -359,7 +349,6 @@ def __init__( | |
self.counters = None | ||
self.digests = None | ||
self._ongoing_background_tasks = AsyncTaskGroup() | ||
self._ongoing_comm_handlers = AsyncTaskGroup() | ||
self._event_finished = asyncio.Event() | ||
|
||
self.listeners = [] | ||
|
@@ -523,17 +512,22 @@ def start_periodic_callbacks(self): | |
pc.start() | ||
|
||
def stop(self): | ||
if not self.__stopped: | ||
self.__stopped = True | ||
if self.__stopped: | ||
return | ||
|
||
for listener in self.listeners: | ||
self.__stopped = True | ||
_stops = set() | ||
for listener in self.listeners: | ||
future = listener.stop() | ||
if inspect.isawaitable(future): | ||
_stops.add(future) | ||
Comment on lines
+522
to
+523
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. IIUC, you introduce this merely to ensure that there are no outside users who provide a listener with an async stop? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep, it's tricky to work out where to put the deprecation, as right at server stop is a bit late. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Well, better than nothing. If anybody is using this, they'll see it in their CI then. We're also not in a rush to remove this, so we can let the warning sit for a while. |
||
|
||
async def stop_listener(listener): | ||
v = listener.stop() | ||
if inspect.isawaitable(v): | ||
await v | ||
if _stops: | ||
|
||
self._ongoing_background_tasks.call_soon(stop_listener, listener) | ||
async def background_stops(): | ||
await asyncio.gather(*_stops) | ||
|
||
self._ongoing_background_tasks.call_soon(background_stops) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we add a comment/warning here to highlight that these are likely to get cancelled due to the lack of a grace period? Alternatively, we could add a small grace period back in that would allow fast tasks to finish but have less of a performance impact? For example, There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I added a `PendingDeprecationWarning`. I have no idea who would implement this asynchronously. I think the deprecation warning there should be sufficient. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think a regular DeprecationWarning would be better; otherwise you have to go via Pend -> Deprecate -> Remove. |
||
|
||
@property | ||
def listener(self): | ||
|
@@ -874,13 +868,11 @@ async def close(self, timeout=None): | |
future = listener.stop() | ||
if inspect.isawaitable(future): | ||
_stops.add(future) | ||
await asyncio.gather(*_stops) | ||
|
||
# TODO: Deal with exceptions | ||
await self._ongoing_background_tasks.stop(timeout=1) | ||
if _stops: | ||
await asyncio.gather(*_stops) | ||
|
||
# TODO: Deal with exceptions | ||
await self._ongoing_comm_handlers.stop(timeout=1) | ||
await self._ongoing_background_tasks.stop() | ||
|
||
await self.rpc.close() | ||
await asyncio.gather(*[comm.close() for comm in list(self._comms)]) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -156,8 +156,10 @@ def blockable_compute(x, lock): | |
await block_compute.acquire() | ||
|
||
# Close in scheduler to ensure we transition and reschedule task properly | ||
await s.close_worker(worker=a.address, stimulus_id="test") | ||
await wait_for_state(fut1.key, "resumed", b) | ||
await asyncio.gather( | ||
wait_for_state(fut1.key, "resumed", b, interval=0), | ||
s.close_worker(worker=a.address, stimulus_id="test"), | ||
) | ||
|
||
block_get_data.release() | ||
await block_compute.release() | ||
|
@@ -415,9 +417,10 @@ async def get_data(self, comm, *args, **kwargs): | |
f3.key: {w2.address}, | ||
} | ||
) | ||
await s.remove_worker(w1.address, stimulus_id="stim-id") | ||
|
||
await wait_for_state(f3.key, "resumed", w2) | ||
await asyncio.gather( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. still failing intermittently https://github.com/dask/distributed/runs/7090644567?check_suite_focus=true#step:11:1844 |
||
wait_for_state(f3.key, "resumed", w2, interval=0), | ||
s.remove_worker(w1.address, stimulus_id="stim-id"), | ||
) | ||
assert_story( | ||
w2.state.log, | ||
[ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the reasoning behind calling task.cancel() multiple times?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the current task is cancelled and any child task suppresses the cancellation, then the child tasks will leak from the task group.