backend start script: check ulimit and increase soft limit if possible #1234

Merged · 5 commits · Jul 18, 2024
src/fact_base.py: 10 additions & 10 deletions
```diff
@@ -38,6 +38,7 @@ class FactBase:
 
     def __init__(self):
         self.run = True
+        self.main_proc = psutil.Process()
 
         self.args = setup_argparser(self.PROGRAM_NAME, self.PROGRAM_DESCRIPTION)
         config.load(self.args.config_file)
@@ -104,7 +105,7 @@ def main(self):
             if self.args.testing:
                 break
             if not counter % 12:  # only check every minute
-                _check_memory_usage()
+                self._check_resource_usage()
             counter += 1
         self.shutdown()
 
@@ -117,15 +118,14 @@ def do_self_test():
         )
         raise DbInterfaceError('Schema mismatch')
 
-
-def _check_memory_usage():
-    memory_usage = psutil.virtual_memory().percent
-    if memory_usage > 95.0:  # noqa: PLR2004
-        logging.critical(f'System memory is critically low: {memory_usage}%')
-    elif memory_usage > 80.0:  # noqa: PLR2004
-        logging.warning(f'System memory is running low: {memory_usage}%')
-    else:
-        logging.info(f'System memory usage: {memory_usage}%')
+    def _check_resource_usage(self):
+        memory_usage = psutil.virtual_memory().percent
+        if memory_usage > 95.0:  # noqa: PLR2004
+            logging.critical(f'System memory is critically low: {memory_usage}%')
+        elif memory_usage > 80.0:  # noqa: PLR2004
+            logging.warning(f'System memory is running low: {memory_usage}%')
+        else:
+            logging.info(f'System memory usage: {memory_usage}%; open file count: {self.main_proc.num_fds()}')
 
 
 def _is_main_process() -> bool:
```
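Side note on the new log line: `psutil.Process()` without arguments refers to the current process, and `num_fds()` (Unix only) reports how many file descriptors it currently has open. A minimal standalone sketch, not part of the PR, showing the counter in action:

```python
# Sketch (not FACT code): watch the FD count of the current process
# change as a file is opened and closed. Unix only; requires psutil.
import psutil

proc = psutil.Process()  # no PID argument -> the current process
print('before:', proc.num_fds())
with open('/dev/null') as handle:  # opening any file costs one FD
    print('while open:', proc.num_fds())
print('after:', proc.num_fds())
```

Caching the handle in `__init__` (the new `self.main_proc`) presumably just avoids re-creating it on every check.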
src/scheduler/unpacking_scheduler.py: 5 additions & 3 deletions
```diff
@@ -107,7 +107,8 @@ def shutdown(self):
         )
         self.stop_containers()
         self._clean_tmp_dirs()
-        self.manager.shutdown()
+        if self.manager:
+            self.manager.shutdown()
         logging.info('Unpacking scheduler offline')
 
     def _clean_tmp_dirs(self):
@@ -138,8 +139,9 @@ def create_containers(self):
             self.worker_tmp_dirs.append(tmp_dir)
 
     def stop_containers(self):
-        with ThreadPoolExecutor(max_workers=len(self.workers)) as pool:
-            pool.map(lambda container: container.stop(), self.workers)
+        if self.workers:
+            with ThreadPoolExecutor(max_workers=len(self.workers)) as pool:
+                pool.map(lambda container: container.stop(), self.workers)
 
     def extraction_loop(self):
         logging.debug(f'Starting unpacking scheduler loop (pid={os.getpid()})')
```
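Both changes guard against a shutdown that runs before start-up has finished: `self.manager` may still be `None` and `self.workers` may still be empty. A hypothetical mini-example (invented names, not FACT code) of the two failure modes the guards prevent:

```python
# Sketch of the failure modes fixed above: shutting down a scheduler whose
# start() never ran. Without the guards, None.shutdown() raises AttributeError
# and ThreadPoolExecutor(max_workers=0) raises ValueError.
from concurrent.futures import ThreadPoolExecutor

class Scheduler:
    def __init__(self):
        self.manager = None  # only set during start()
        self.workers = []    # containers are appended during start()

    def shutdown(self):
        if self.workers:  # len(self.workers) == 0 would crash the executor
            with ThreadPoolExecutor(max_workers=len(self.workers)) as pool:
                pool.map(lambda worker: worker.stop(), self.workers)
        if self.manager:  # skip if the manager was never created
            self.manager.shutdown()

Scheduler().shutdown()  # safe even though start() never ran
```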
src/start_fact_backend.py: 64 additions & 2 deletions
```diff
@@ -20,6 +20,7 @@
 import grp
 import logging
 import os
+import resource
 import sys
 from pathlib import Path
 
@@ -37,6 +38,8 @@
 from scheduler.unpacking_scheduler import UnpackingScheduler
 from storage.unpacking_locks import UnpackingLockManager
 
+ULIMIT_MIN = 1_024
+
 
 class FactBackend(FactBase):
     PROGRAM_NAME = 'FACT Backend'
@@ -47,6 +50,7 @@ def __init__(self):
         super().__init__()
         self.unpacking_lock_manager = UnpackingLockManager()
         self._create_docker_base_dir()
+        _check_ulimit()
 
         try:
             self.analysis_service = AnalysisScheduler(unpacking_locks=self.unpacking_lock_manager)
@@ -110,6 +114,64 @@ def _exception_occurred(self):
         )
 
 
+def _check_ulimit():
+    """
+    2024-07-16 - the numbers are prone to change over time
+
+    Each process has a hard limit and a soft limit for the maximum number of file descriptors (FDs) open at the
+    same time. Since FACT makes extensive use of multiprocessing features, it uses up a lot of those FDs, and if
+    we run out, an OSError is raised. To mitigate this, we try to increase the soft limit and print a warning if
+    the hard limit is low. With the default configuration, FACT uses 556 FDs (and potentially many more if you
+    crank up the worker counts).
+
+    The FDs are distributed among the individual backend components as follows:
+
+    | component              | init | start | sum |
+    | ---------------------- | ---- | ----- | --- |
+    | fact_base              | 7    | -     | 7   |
+    | unpacking_lock_manager | 2    | -     | 2   |
+    | analysis_service       | 200  | 294   | 494 |
+    | unpacking_service      | 2    | 20    | 22  |
+    | compare_service        | 3    | 4     | 7   |
+    | intercom               | -    | 24    | 24  |
+    | total                  |      |       | 556 |
+
+    Most of this stems from the analysis_service, which in turn breaks down as follows:
+
+    | component                | init | start | sum |
+    | ------------------------ | ---- | ----- | --- |
+    | plugins                  | 196  | 268   |     |
+    | process queue            | 2    | -     |     |
+    | AnalysisStatus           | 2    | 2     |     |
+    | AnalysisTaskScheduler    | -    | -     |     |
+    | FSOrganizer              | -    | -     |     |
+    | BackendDbInterface       | -    | -     |     |
+    | scheduler processes (4x) | -    | 16    |     |
+    | collector processes (2x) | -    | 8     |     |
+    | total                    | 200  | 294   | 494 |
+
+    The 29 plugins are the main source of FD usage, and most of their FDs are already used during initialization.
+    The main culprits are the input and output queues: each queue contributes two FDs. In addition, the manager
+    processes for passing data between processes consume two FDs each, and other multiprocessing objects
+    (Values, Arrays, etc.) add a few more. Still more FDs are used once the worker processes are started.
+    """
+    soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
+    if hard_limit < ULIMIT_MIN:
+        logging.warning(
+            'The open file limit appears to be low. This could lead to "too many open files" errors. Please '
+            'increase the open file hard limit for the process that runs FACT.'
+        )
+    if soft_limit < hard_limit:
+        # we are only allowed to increase the soft limit and not the hard limit
+        resource.setrlimit(resource.RLIMIT_NOFILE, (hard_limit, hard_limit))
```
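The docstring's claim that every queue costs a pair of FDs is easy to verify, since a `multiprocessing.Queue` is backed by a pipe. A quick sketch (not part of the PR; Unix only, psutil assumed available):

```python
# Count the FDs added by creating multiprocessing queues. Each queue wraps
# an os.pipe(), i.e. one read end and one write end -> two FDs per queue.
import multiprocessing

import psutil

proc = psutil.Process()
baseline = proc.num_fds()
queues = [multiprocessing.Queue() for _ in range(10)]
print(f'10 queues -> {proc.num_fds() - baseline} new FDs')  # typically prints 20
```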


Finally, the entry point wraps start-up in a try/except so that an `OSError` (e.g. from running out of FDs after all) is logged and the backend is shut down cleanly:

```diff
 if __name__ == '__main__':
-    FactBackend().main()
-    sys.exit(0)
+    backend = FactBackend()
+    try:
+        backend.main()
+        sys.exit(0)
+    except OSError as error:
+        logging.exception(f'Exception during start: {error}')
+        backend.shutdown()
+        sys.exit(1)
```
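For context (not from the PR): the failure mode `_check_ulimit()` defends against is easy to reproduce by lowering the soft limit for the current process and exhausting it; `open()` then fails with exactly the kind of `OSError` the new `except` branch catches:

```python
# Provoke the "too many open files" OSError that _check_ulimit() mitigates.
# Lowering RLIMIT_NOFILE only affects the current process. Unix only.
import resource

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (64, hard))  # shrink the soft limit
handles = []
try:
    while True:
        handles.append(open('/dev/null'))
except OSError as error:
    print(f'failed after {len(handles)} open files: {error}')  # [Errno 24] Too many open files
finally:
    for handle in handles:
        handle.close()
    resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))  # restore the old limit
```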