From afc9e03149bbe9d12cc4252a17a7d835eb2384cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Stucke?=
Date: Wed, 3 Jul 2024 09:00:38 +0200
Subject: [PATCH 1/5] backend start script: check ulimit and increase soft
 limit if possible

---
 src/start_fact_backend.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/start_fact_backend.py b/src/start_fact_backend.py
index 590a92609..460e047a5 100755
--- a/src/start_fact_backend.py
+++ b/src/start_fact_backend.py
@@ -20,6 +20,7 @@
 import grp
 import logging
 import os
+import resource
 import sys
 from pathlib import Path
 
@@ -37,6 +38,8 @@
 from scheduler.unpacking_scheduler import UnpackingScheduler
 from storage.unpacking_locks import UnpackingLockManager
 
+ULIMIT_MIN = 1_024
+
 
 class FactBackend(FactBase):
     PROGRAM_NAME = 'FACT Backend'
@@ -47,6 +50,7 @@ def __init__(self):
         super().__init__()
         self.unpacking_lock_manager = UnpackingLockManager()
         self._create_docker_base_dir()
+        _check_ulimit()
 
         try:
             self.analysis_service = AnalysisScheduler(unpacking_locks=self.unpacking_lock_manager)
@@ -110,6 +114,18 @@ def _exception_occurred(self):
         )
 
 
+def _check_ulimit():
+    soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
+    if hard_limit < ULIMIT_MIN:
+        logging.warning(
+            'The open file limit appears to be low. This could lead to "too many open files" errors. Please increase '
+            'the open file hard limit for the process that runs FACT.'
+        )
+    if soft_limit < hard_limit:
+        # we are only allowed to increase the soft limit and not the hard limit
+        resource.setrlimit(resource.RLIMIT_NOFILE, (hard_limit, hard_limit))
+
+
 if __name__ == '__main__':
     FactBackend().main()
     sys.exit(0)

From 5103c07ca4ed0d9c46c7bf5e1d73e7c24d9d8f08 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Stucke?=
Date: Fri, 5 Jul 2024 16:26:46 +0200
Subject: [PATCH 2/5] fact base: add open FD count to memory usage log

---
 src/fact_base.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/fact_base.py b/src/fact_base.py
index 63dd164c8..e1c5aedbd 100644
--- a/src/fact_base.py
+++ b/src/fact_base.py
@@ -38,6 +38,7 @@ class FactBase:
 
     def __init__(self):
         self.run = True
+        self.main_proc = psutil.Process()
         self.args = setup_argparser(self.PROGRAM_NAME, self.PROGRAM_DESCRIPTION)
         config.load(self.args.config_file)
 
@@ -104,7 +105,7 @@ def main(self):
             if self.args.testing:
                 break
             if not counter % 12:  # only check every minute
-                _check_memory_usage()
+                self._check_resource_usage()
             counter += 1
 
         self.shutdown()
@@ -117,15 +118,14 @@ def do_self_test():
         )
         raise DbInterfaceError('Schema mismatch')
 
-
-def _check_memory_usage():
-    memory_usage = psutil.virtual_memory().percent
-    if memory_usage > 95.0:  # noqa: PLR2004
-        logging.critical(f'System memory is critically low: {memory_usage}%')
-    elif memory_usage > 80.0:  # noqa: PLR2004
-        logging.warning(f'System memory is running low: {memory_usage}%')
-    else:
-        logging.info(f'System memory usage: {memory_usage}%')
+    def _check_resource_usage(self):
+        memory_usage = psutil.virtual_memory().percent
+        if memory_usage > 95.0:  # noqa: PLR2004
+            logging.critical(f'System memory is critically low: {memory_usage}%')
+        elif memory_usage > 80.0:  # noqa: PLR2004
+            logging.warning(f'System memory is running low: {memory_usage}%')
+        else:
+            logging.info(f'System memory usage: {memory_usage}%; open file count: {self.main_proc.num_fds()}')
 
 
 def _is_main_process() -> bool:

From 6b0324f09b584a10720ad948a5a105b47b124593 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Stucke?=
Date: Fri, 5 Jul 2024 16:28:15 +0200
Subject: [PATCH 3/5] backend: better handling of "too many open files"
 errors

---
 src/scheduler/unpacking_scheduler.py |  8 +++++---
 src/start_fact_backend.py            | 10 ++++++++--
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/scheduler/unpacking_scheduler.py b/src/scheduler/unpacking_scheduler.py
index 006d3f4d9..b0279b1b5 100644
--- a/src/scheduler/unpacking_scheduler.py
+++ b/src/scheduler/unpacking_scheduler.py
@@ -107,7 +107,8 @@ def shutdown(self):
         )
         self.stop_containers()
         self._clean_tmp_dirs()
-        self.manager.shutdown()
+        if self.manager:
+            self.manager.shutdown()
         logging.info('Unpacking scheduler offline')
 
     def _clean_tmp_dirs(self):
@@ -138,8 +139,9 @@ def create_containers(self):
             self.worker_tmp_dirs.append(tmp_dir)
 
     def stop_containers(self):
-        with ThreadPoolExecutor(max_workers=len(self.workers)) as pool:
-            pool.map(lambda container: container.stop(), self.workers)
+        if self.workers:
+            with ThreadPoolExecutor(max_workers=len(self.workers)) as pool:
+                pool.map(lambda container: container.stop(), self.workers)
 
     def extraction_loop(self):
         logging.debug(f'Starting unpacking scheduler loop (pid={os.getpid()})')
diff --git a/src/start_fact_backend.py b/src/start_fact_backend.py
index 460e047a5..b8b49232a 100755
--- a/src/start_fact_backend.py
+++ b/src/start_fact_backend.py
@@ -127,5 +127,11 @@ def _check_ulimit():
 
 
 if __name__ == '__main__':
-    FactBackend().main()
-    sys.exit(0)
+    backend = FactBackend()
+    try:
+        backend.main()
+        sys.exit(0)
+    except OSError as error:
+        logging.exception(f'Exception during start: {error}')
+        backend.shutdown()
+        sys.exit(1)

From 4f248c18cce4539f2bfa72c1d24a2344a292ae4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Stucke?=
Date: Mon, 15 Jul 2024 09:31:11 +0200
Subject: [PATCH 4/5] start_backend: added docstring to check_ulimit function

---
 src/start_fact_backend.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/start_fact_backend.py b/src/start_fact_backend.py
index b8b49232a..a35b03cd5 100755
--- a/src/start_fact_backend.py
+++ b/src/start_fact_backend.py
@@ -115,6 +115,13 @@ def _exception_occurred(self):
 
 
 def _check_ulimit():
+    """
+    Each process has a hard limit and a soft limit for the maximum number of files opened at the same time. Since
+    FACT makes extensive use of multiprocessing features, it uses up a lot of those file descriptors and if we run
+    out, this raises an OSError. To mitigate this, we try to increase the soft limit and print a warning if the
+    hard limit is low. With the default configuration, FACT uses about 560 file descriptors (and potentially many
+    more if you crank up the worker counts).
+    """
     soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
     if hard_limit < ULIMIT_MIN:
         logging.warning(

From 278dbcc8b02e8ee24de99c8d3ffe0864af7a8b1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Stucke?=
Date: Tue, 16 Jul 2024 13:54:01 +0200
Subject: [PATCH 5/5] start_backend: extended docstring of check_ulimit
 function

---
 src/start_fact_backend.py | 43 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/src/start_fact_backend.py b/src/start_fact_backend.py
index a35b03cd5..f6893f7b8 100755
--- a/src/start_fact_backend.py
+++ b/src/start_fact_backend.py
@@ -116,11 +116,44 @@ def _exception_occurred(self):
 
 def _check_ulimit():
     """
-    Each process has a hard limit and a soft limit for the maximum number of files opened at the same time. Since
-    FACT makes extensive use of multiprocessing features, it uses up a lot of those file descriptors and if we run
-    out, this raises an OSError. To mitigate this, we try to increase the soft limit and print a warning if the
-    hard limit is low. With the default configuration, FACT uses about 560 file descriptors (and potentially many
-    more if you crank up the worker counts).
+    (as of 2024-07-16; these numbers are prone to change over time)
+
+    Each process has a hard limit and a soft limit for the maximum number of file descriptors (FDs) opened at the
+    same time. Since FACT makes extensive use of multiprocessing features, it uses up a lot of those FDs, and if we
+    run out, this raises an OSError. To mitigate this, we try to increase the soft limit and print a warning if the
+    hard limit is low. With the default configuration, FACT uses 556 FDs (and potentially many more if you crank up
+    the worker counts).
+
+    The FDs are distributed among the individual backend components as follows:
+
+    | component              | init | start | sum |
+    | ---------------------- | ---- | ----- | --- |
+    | fact_base              |    7 |     - |   7 |
+    | unpacking_lock_manager |    2 |     - |   2 |
+    | analysis_service       |  200 |   294 | 494 |
+    | unpacking_service      |    2 |    20 |  22 |
+    | compare_service        |    3 |     4 |   7 |
+    | intercom               |    - |    24 |  24 |
+    | total                  |      |       | 556 |
+
+    Most of this stems from the analysis_service, which in turn breaks down as follows:
+
+    | component                | init | start | sum |
+    | ------------------------ | ---- | ----- | --- |
+    | plugins                  |  196 |   268 |     |
+    | process queue            |    2 |     - |     |
+    | AnalysisStatus           |    2 |     2 |     |
+    | AnalysisTaskScheduler    |    - |     - |     |
+    | FSOrganizer              |    - |     - |     |
+    | BackendDbInterface       |    - |     - |     |
+    | scheduler processes (4x) |    - |    16 |     |
+    | collector processes (2x) |    - |     8 |     |
+    | total                    |  200 |   294 | 494 |
+
+    The 29 plugins are the main source of used FDs. Many FDs are used during initialization, chiefly for the input
+    and output queues: each queue contributes two FDs. In addition to that, the manager processes for passing data
+    between processes also consume two FDs each, and some more multiprocessing objects (Values, Arrays, etc.) add a
+    few more. Even more FDs are used once the worker processes are started.
     """
     soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
     if hard_limit < ULIMIT_MIN:
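
Note on the ulimit mechanics used in patch 1: the soft/hard limit handling of _check_ulimit() can be tried out in isolation. Below is a minimal standalone sketch (an illustration, not part of the patch set) using only the standard library resource module:

    import resource

    # each process has a (soft, hard) pair of limits for open file descriptors
    soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
    print(f'soft limit: {soft_limit}, hard limit: {hard_limit}')

    # an unprivileged process may raise its soft limit up to (but never beyond)
    # the hard limit; this mirrors what _check_ulimit() does
    if soft_limit < hard_limit:
        resource.setrlimit(resource.RLIMIT_NOFILE, (hard_limit, hard_limit))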
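
Note on the FD accounting in the patch 5 docstring: the claim that each queue contributes two FDs can be checked with a similar standalone sketch, assuming psutil is installed (fact_base.py already uses it); Process.num_fds() is POSIX-only:

    import multiprocessing

    import psutil

    process = psutil.Process()
    fds_before = process.num_fds()
    queues = [multiprocessing.Queue() for _ in range(10)]
    fds_after = process.num_fds()
    # each multiprocessing.Queue is backed by a pipe (one read end plus one
    # write end), so this typically prints 2.0 on Linux
    print(f'FDs per queue: {(fds_after - fds_before) / len(queues):.1f}')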