Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(test suite): use cgroups to detect if a test leaks processes #6470

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ jobs:
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --cgroupns=private --security-opt umask=/sys/fs/cgroup
koivunej marked this conversation as resolved.
Show resolved Hide resolved
strategy:
fail-fast: false
matrix:
Expand All @@ -456,6 +456,9 @@ jobs:
submodules: true
fetch-depth: 1

- name: Setup cgroup for use by test suite
run: sudo bash -x /setup_neon_testsuite_cgroup.bash

- name: Pytest regression tests
uses: ./.github/actions/run-python-test-set
with:
Expand All @@ -472,6 +475,7 @@ jobs:
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
NEON_TEST_SUITE_USE_CGROUPS: /sys/fs/cgroup/neon_testsuite

- name: Merge and upload coverage data
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
Expand Down
15 changes: 12 additions & 3 deletions Dockerfile.buildtools
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
FROM debian:bullseye-slim

# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]

# System deps
RUN set -e \
Expand Down Expand Up @@ -43,13 +40,25 @@ RUN set -e \
openssh-client \
parallel \
pkg-config \
sudo \
unzip \
wget \
xz-utils \
zlib1g-dev \
zstd \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]
RUN echo "#!/usr/bin/env bash \
set -exuo pipefail \
mkdir /sys/fs/cgroup/neon_testsuite \
chown -R nonroot:nonroot /sys/fs/cgroup/neon_testsuite \
echo SUCCESS: cgroup set up for user nonroot at /sys/fs/cgroup/neon_testsuite \
" > /setup_neon_testsuite_cgroup.bash && chmod +x /setup_neon_testsuite_cgroup.bash
RUN echo "ALL ALL = (ALL) NOPASSWD: /setup_neon_testsuite_cgroup.bash" >> /etc/sudoers

# protobuf-compiler (protoc)
ENV PROTOC_VERSION 25.1
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
Expand Down
19 changes: 17 additions & 2 deletions control_plane/src/endpoint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ impl Endpoint {
}

fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
// TODO use background_process::stop_process instead
// TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
let pid = nix::unistd::Pid::from_raw(pid as i32);
Expand Down Expand Up @@ -583,9 +583,21 @@ impl Endpoint {
}

let child = cmd.spawn()?;
// set up a scopeguard to kill & wait for the child in case we panic or bail below
let child = scopeguard::guard(child, |mut child| {
println!("SIGKILL & wait the started process");
(|| {
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned
child.kill().context("SIGKILL child")?;
child.wait().context("wait() for child process")?;
anyhow::Ok(())
})()
.with_context(|| format!("scopeguard kill&wait child {child:?}"))
.unwrap();
});

// Write down the pid so we can wait for it when we want to stop
// TODO use background_process::start_process instead
// TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
let pid = child.id();
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
std::fs::write(pidfile_path, pid.to_string())?;
Expand Down Expand Up @@ -634,6 +646,9 @@ impl Endpoint {
std::thread::sleep(ATTEMPT_INTERVAL);
}

// disarm the scopeguard, let the child outlive this function (and neon_local invoction)
drop(scopeguard::ScopeGuard::into_inner(child));

Ok(())
}

Expand Down
101 changes: 101 additions & 0 deletions test_runner/fixtures/neon_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

import abc
import asyncio
import errno
import filecmp
import json
import os
import re
import shutil
import signal
import subprocess
import tempfile
import textwrap
Expand Down Expand Up @@ -173,6 +175,35 @@ def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Ite
yield versioned_dir


@pytest.fixture(scope="function")
def test_cgroup_dir(request: FixtureRequest) -> Optional[Path]:
## Use like so:
# systemd-run --user --shell
# cat /proc/self/cgroup
# # will look like so: 0::/user.slice/user-1000.slice/user@1000.service/app.slice/run-u5.service
# env \
# NEON_TEST_SUITE_USE_CGROUPS=/sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service/app.slice/run-u5.service \
# BUILD_TYPE=debug \
# DEFAULT_PG_VERSION=15 \
# ./scripts/pytest

var = os.getenv("NEON_TEST_SUITE_USE_CGROUPS")
if var is None:
return None
root = Path(var)
assert root.is_dir()
assert (
"cgroup2fs"
== subprocess.check_output(
["stat", "--file-system", "--format=%T", root],
text=True,
).strip()
)
test_cgroup_dir: Path = get_test_cgroup_dir(request, root)
test_cgroup_dir.mkdir(exist_ok=False, parents=False)
return test_cgroup_dir


def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]:
"""Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar.

Expand Down Expand Up @@ -447,6 +478,7 @@ def __init__(
initial_tenant: Optional[TenantId] = None,
initial_timeline: Optional[TimelineId] = None,
pageserver_virtual_file_io_engine: Optional[str] = None,
test_cgroup_dir: Optional[Path] = None,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
Expand Down Expand Up @@ -483,6 +515,7 @@ def __init__(
self.top_output_dir = top_output_dir

self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
self.test_cgroup_dir = test_cgroup_dir

assert test_name.startswith(
"test_"
Expand Down Expand Up @@ -876,6 +909,15 @@ def cleanup_remote_storage(self):
x.do_cleanup()

def __enter__(self) -> "NeonEnvBuilder":
if self.test_cgroup_dir is not None:
log.info(f"putting test runner into cgroup {self.test_cgroup_dir}")
procs_file = self.test_cgroup_dir / "cgroup.procs"
# TODO: auto-cleanup the cgroup, share code with __exit__
pids = procs_file.read_text().split()
assert pids == []
mypid = os.getpid()
with open(procs_file, "a") as f:
f.write(f" {mypid}\n")
return self

def __exit__(
Expand Down Expand Up @@ -929,6 +971,57 @@ def __exit__(
if cleanup_error is not None:
cleanup_error = e

if self.test_cgroup_dir is not None:
# TODO: ensure that we're the only python thread running;
# otherwise, if a test leaks a thread,
# our checking here would race with that other thread.
# => https://github.com/neondatabase/neon/issues/6486
log.info(f"check test runner cgroup for leaked processes: {self.test_cgroup_dir}")
procs_file = self.test_cgroup_dir / "cgroup.procs"
mypid = os.getpid()
# move ourselves out of cgroup
with open(self.test_cgroup_dir.parent / "cgroup.procs", "a") as f:
f.write(f"{mypid}")
# now freeze the test cgroup, so we can race-free list & SIGKILL the processes
freeze_file = self.test_cgroup_dir / "cgroup.freeze"
freeze_file.write_text("1")
koivunej marked this conversation as resolved.
Show resolved Hide resolved
# inspect remaining pids
pids = [int(pid) for pid in procs_file.read_text().split()]
had_leaked_pids = len(pids) > 0
for pid in pids:
# dump info
try:
args_file = Path(f"/proc/{pid}/cmdline").read_bytes()
problame marked this conversation as resolved.
Show resolved Hide resolved
args_parsed = [
bytes.decode("utf-8") for bytes in args_file.split(b"\x00")
] # NULL-byte separated
args = f"{args_parsed}"
except Exception as e:
args = f"cmdline unavailable, {type(e)}, {e}"
log.warning(f"SIGKILLing leaked process: {pid}: {args}")
# send SIGKILL to the frozen process
os.kill(pid, signal.SIGKILL)
# thaw the cgroup
freeze_file.write_text("0")
# wait for processes to exit
while True:
try:
self.test_cgroup_dir.rmdir()
break
except OSError as e:
if e.errno != errno.EBUSY:
raise
pids = [int(pid) for pid in procs_file.read_text().split()]
if len(pids) == 0:
break
log.warning(f"waiting for pids to exit: {pids}")
time.sleep(0.1)
log.info("all pids exited and test cgroup has been deleted")
cleanup_cmd = (
f"for pid in $(cat '{procs_file}'); do; kill -9 $pid; done; rmdir {procs_file}"
)
assert not had_leaked_pids, f"had pids in test cgroup after NeonEnvBuilder teardown, see logs for details & cleanup help\ncleanup using bash command: {cleanup_cmd}\n"


class NeonEnv:
"""
Expand Down Expand Up @@ -1200,6 +1293,7 @@ def _shared_simple_env(
pg_distrib_dir: Path,
pg_version: PgVersion,
pageserver_virtual_file_io_engine: str,
test_cgroup_dir: Optional[Path],
) -> Iterator[NeonEnv]:
"""
# Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
Expand Down Expand Up @@ -1230,6 +1324,7 @@ def _shared_simple_env(
test_name=request.node.name,
test_output_dir=test_output_dir,
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
test_cgroup_dir=test_cgroup_dir,
) as builder:
env = builder.init_start()

Expand Down Expand Up @@ -1269,6 +1364,7 @@ def neon_env_builder(
test_overlay_dir: Path,
top_output_dir: Path,
pageserver_virtual_file_io_engine: str,
test_cgroup_dir: Optional[Path],
) -> Iterator[NeonEnvBuilder]:
"""
Fixture to create a Neon environment for test.
Expand Down Expand Up @@ -1302,6 +1398,7 @@ def neon_env_builder(
test_name=request.node.name,
test_output_dir=test_output_dir,
test_overlay_dir=test_overlay_dir,
test_cgroup_dir=test_cgroup_dir,
) as builder:
yield builder

Expand Down Expand Up @@ -3636,6 +3733,10 @@ def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
return get_test_output_dir(request, top_output_dir) / "repo"


def get_test_cgroup_dir(request: FixtureRequest, top_cgroup_dir: Path) -> Path:
return _get_test_dir(request, top_cgroup_dir, "")


def pytest_addoption(parser: Parser):
parser.addoption(
"--preserve-database-files",
Expand Down
3 changes: 2 additions & 1 deletion test_runner/regress/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
from fixtures.utils import subprocess_capture


def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder):
# fixture order matters here: neon_env_builder leaked process assertions
def test_import_from_vanilla(test_output_dir, pg_bin, neon_env_builder, vanilla_pg):
# Put data in vanilla pg
vanilla_pg.start()
vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
Expand Down
2 changes: 2 additions & 0 deletions test_runner/regress/test_neon_local_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,5 @@ def test_neon_two_primary_endpoints_fail(
env.neon_cli.endpoint_stop("ep1")
# ep1 is stopped so create ep2 will succeed
env.neon_cli.endpoint_start("ep2")
# cleanup
env.neon_cli.endpoint_stop("ep2")
Loading