From a4597297e8e2955019d548bceef3d0e0cbb57f89 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 16 Oct 2020 09:56:53 -0400 Subject: [PATCH 1/2] bpo-46205: exit if no workers are alive in runtest_mp There is a race condition in runtest_mp where if a worker already pushed its final output to the queue, but is still alive, then the the main thread waits forever on the the now-empty queue. This re-checks the status of the workers after output.get() call times out (after 30 seconds). --- Lib/test/libregrtest/runtest_mp.py | 2 ++ Misc/NEWS.d/next/Tests/2022-01-07-14-06-12.bpo-46205.dnc2OC.rst | 1 + 2 files changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Tests/2022-01-07-14-06-12.bpo-46205.dnc2OC.rst diff --git a/Lib/test/libregrtest/runtest_mp.py b/Lib/test/libregrtest/runtest_mp.py index f02f56e98bcda2..eaf1b626e09468 100644 --- a/Lib/test/libregrtest/runtest_mp.py +++ b/Lib/test/libregrtest/runtest_mp.py @@ -416,6 +416,8 @@ def _get_result(self) -> QueueOutput | None: running = get_running(self.workers) if running and not self.ns.pgo: self.log('running: %s' % ', '.join(running)) + elif not any(worker.is_alive() for worker in self.workers): + return None def display_result(self, mp_result: MultiprocessResult) -> None: result = mp_result.result diff --git a/Misc/NEWS.d/next/Tests/2022-01-07-14-06-12.bpo-46205.dnc2OC.rst b/Misc/NEWS.d/next/Tests/2022-01-07-14-06-12.bpo-46205.dnc2OC.rst new file mode 100644 index 00000000000000..7c6121fb162495 --- /dev/null +++ b/Misc/NEWS.d/next/Tests/2022-01-07-14-06-12.bpo-46205.dnc2OC.rst @@ -0,0 +1 @@ +Fix hang in runtest_mp due to race condition From 5827e284742c973ec30c4aedc534a9382d9d3b91 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Mon, 10 Jan 2022 09:46:18 -0800 Subject: [PATCH 2/2] Refactor _get_result() based on @corona10's suggestion. --- Lib/test/libregrtest/runtest_mp.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Lib/test/libregrtest/runtest_mp.py b/Lib/test/libregrtest/runtest_mp.py index eaf1b626e09468..75444e48080ce5 100644 --- a/Lib/test/libregrtest/runtest_mp.py +++ b/Lib/test/libregrtest/runtest_mp.py @@ -392,16 +392,12 @@ def stop_workers(self) -> None: worker.wait_stopped(start_time) def _get_result(self) -> QueueOutput | None: - if not any(worker.is_alive() for worker in self.workers): - # all worker threads are done: consume pending results - try: - return self.output.get(timeout=0) - except queue.Empty: - return None - use_faulthandler = (self.ns.timeout is not None) timeout = PROGRESS_UPDATE - while True: + + # bpo-46205: check the status of workers every iteration to avoid + # waiting forever on an empty queue. + while any(worker.is_alive() for worker in self.workers): if use_faulthandler: faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT, exit=True) @@ -416,8 +412,12 @@ def _get_result(self) -> QueueOutput | None: running = get_running(self.workers) if running and not self.ns.pgo: self.log('running: %s' % ', '.join(running)) - elif not any(worker.is_alive() for worker in self.workers): - return None + + # all worker threads are done: consume pending results + try: + return self.output.get(timeout=0) + except queue.Empty: + return None def display_result(self, mp_result: MultiprocessResult) -> None: result = mp_result.result