Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/ray-project/ray into env_…
Browse files Browse the repository at this point in the history
…runner_support_connectors_04_learner_api_changes
  • Loading branch information
sven1977 committed Jan 10, 2024
2 parents b769c05 + 76c30e3 commit f5ffe83
Show file tree
Hide file tree
Showing 50 changed files with 732 additions and 650 deletions.
25 changes: 23 additions & 2 deletions .buildkite/core.rayci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ steps:
commands:
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/dag/... python/ray/autoscaler/v2/... core
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
--except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,container,manual
--except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,container,manual

- label: ":ray: core: redis tests"
tags: python
Expand All @@ -41,7 +41,7 @@ steps:
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... //python/ray/dag/... python/ray/autoscaler/v2/... core
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
--test-env=TEST_EXTERNAL_REDIS=1
--except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,container,manual
--except-tags debug_tests,asan_tests,post_wheel_build,ha_integration,mem_pressure,tmpfs,container,manual

- label: ":ray: core: :windows: python tests"
tags: python
Expand Down Expand Up @@ -75,6 +75,27 @@ steps:
--test_tag_filters=mem_pressure -- //python/ray/tests/...
job_env: corebuild

- label: ":ray: core: out of disk tests"
tags:
- python
- oss
instance_type: small
commands:
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core
--only-tags=tmpfs --tmp-filesystem=tmpfs
job_env: corebuild

- label: ":ray: core: out of disk redis tests"
tags:
- python
- oss
instance_type: small
commands:
- bazel run //ci/ray_ci:test_in_docker -- //python/ray/tests/... core
--test-env=TEST_EXTERNAL_REDIS=1
--only-tags=tmpfs --tmp-filesystem=tmpfs
job_env: corebuild

- label: ":ray: core: workflow tests"
tags:
- python
Expand Down
8 changes: 5 additions & 3 deletions ci/build/upload_build_info.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# Cause the script to exit if a single command fails.
set -ex

BAZEL_LOG_DIR=${1:-"/tmp/bazel_event_logs"}

readonly PIPELINE_POSTMERGE="0189e759-8c96-4302-b6b5-b4274406bf89"
readonly PIPELINE_CIV1_BRANCH="0183465b-c6fb-479b-8577-4cfd743b545d"
if [[
Expand All @@ -23,10 +25,10 @@ RAY_DIR=$(cd "${ROOT_DIR}/../../"; pwd)

cd "${RAY_DIR}"

mkdir -p /tmp/bazel_event_logs
mkdir -p "$BAZEL_LOG_DIR"

./ci/build/get_build_info.py > /tmp/bazel_event_logs/metadata.json
./ci/build/get_build_info.py > "$BAZEL_LOG_DIR"/metadata.json

# Keep cryptography/openssl in sync with `requirements/test-requirements.txt`
pip install -q -c "${RAY_DIR}/python/requirements.txt" docker aws_requests_auth boto3 cryptography==38.0.1 PyOpenSSL==23.0.0
python .buildkite/copy_files.py --destination logs --path /tmp/bazel_event_logs
python .buildkite/copy_files.py --destination logs --path "$BAZEL_LOG_DIR"
3 changes: 3 additions & 0 deletions ci/ray_ci/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def get_run_command(
script: List[str],
network: Optional[str] = None,
gpu_ids: Optional[List[int]] = None,
volumes: Optional[List[str]] = None,
) -> List[str]:
"""
Get docker run command
Expand All @@ -89,6 +90,8 @@ def get_run_command(
command += ["--env", env]
if network:
command += ["--network", network]
for volume in volumes or []:
command += ["--volume", volume]
return (
command
+ self.get_run_command_extra_args(gpu_ids)
Expand Down
11 changes: 11 additions & 0 deletions ci/ray_ci/linux_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,16 @@ def __init__(
docker_tag: str,
volumes: Optional[List[str]] = None,
envs: Optional[List[str]] = None,
tmp_filesystem: Optional[str] = None,
) -> None:
super().__init__(docker_tag, envs)
self.volumes = volumes or []

if tmp_filesystem is not None:
if tmp_filesystem != "tmpfs":
raise ValueError("Only tmpfs is supported for tmp filesystem")
self.tmp_filesystem = tmp_filesystem

def install_ray(self, build_type: Optional[str] = None) -> List[str]:
env = os.environ.copy()
env["DOCKER_BUILDKIT"] = "1"
Expand Down Expand Up @@ -58,6 +64,11 @@ def get_run_command_extra_args(
"--add-host",
"rayci.localhost:host-gateway",
]
if self.tmp_filesystem:
extra_args += [
"--mount",
f"type={self.tmp_filesystem},destination=/tmp",
]
for volume in self.volumes:
extra_args += ["--volume", volume]
for cap in _DOCKER_CAP_ADD:
Expand Down
3 changes: 3 additions & 0 deletions ci/ray_ci/linux_tester_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def __init__(
shard_ids: Optional[List[int]] = None,
skip_ray_installation: bool = False,
build_type: Optional[str] = None,
tmp_filesystem: Optional[str] = None,
) -> None:
LinuxContainer.__init__(
self,
Expand All @@ -25,11 +26,13 @@ def __init__(
f"{os.environ.get('RAYCI_CHECKOUT_DIR')}:/ray-mount",
"/var/run/docker.sock:/var/run/docker.sock",
],
tmp_filesystem=tmp_filesystem,
)
TesterContainer.__init__(
self,
shard_count,
gpus,
bazel_log_dir="/tmp/bazel_event_logs",
network=network,
test_envs=test_envs,
shard_ids=shard_ids,
Expand Down
6 changes: 6 additions & 0 deletions ci/ray_ci/test_linux_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,11 @@ def test_get_run_command() -> None:
assert "/bin/bash -iecuo pipefail -- hi\nhello" in command


def test_get_run_command_tmpfs() -> None:
    """
    A container built with ``tmp_filesystem="tmpfs"`` must add a tmpfs mount
    for ``/tmp`` to the docker run command.
    """
    run_args = LinuxContainer("test", tmp_filesystem="tmpfs").get_run_command(
        ["hi", "hello"]
    )
    # The command is a list of arguments; flatten it so the flag and its value
    # can be matched as one substring.
    flattened = " ".join(run_args)
    assert "--mount type=tmpfs,destination=/tmp" in flattened


if __name__ == "__main__":
sys.exit(pytest.main(["-v", __file__]))
30 changes: 27 additions & 3 deletions ci/ray_ci/test_linux_tester_container.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import sys
import pytest
import tempfile
from unittest import mock
from typing import List, Optional

Expand Down Expand Up @@ -49,17 +51,20 @@ def _mock_popen(input: List[str]) -> None:
):
LinuxTesterContainer(
"team", network="host", build_type="debug", test_envs=["ENV_01", "ENV_02"]
)._run_tests_in_docker(["t1", "t2"], [0, 1], ["v=k"], "flag")
)._run_tests_in_docker(["t1", "t2"], [0, 1], "/tmp", ["v=k"], "flag")
input_str = inputs[-1]
assert "--env ENV_01 --env ENV_02 --env BUILDKITE" in input_str
assert "--network host" in input_str
assert '--gpus "device=0,1"' in input_str
assert "--volume /tmp:/tmp/bazel_event_logs" in input_str
assert (
"bazel test --jobs=1 --config=ci $(./ci/run/bazel_export_options) "
"--config=ci-debug --test_env v=k --test_arg flag t1 t2" in input_str
)

LinuxTesterContainer("team")._run_tests_in_docker(["t1", "t2"], [], ["v=k"])
LinuxTesterContainer("team")._run_tests_in_docker(
["t1", "t2"], [], "/tmp", ["v=k"]
)
input_str = inputs[-1]
assert "--env BUILDKITE_BUILD_URL" in input_str
assert "--gpus" not in input_str
Expand Down Expand Up @@ -126,6 +131,7 @@ def test_run_tests() -> None:
def _mock_run_tests_in_docker(
test_targets: List[str],
gpu_ids: List[int],
bazel_log_dir: str,
test_envs: List[str],
test_arg: Optional[str] = None,
) -> MockPopen:
Expand All @@ -134,7 +140,13 @@ def _mock_run_tests_in_docker(
def _mock_shard_tests(tests: List[str], workers: int, worker_id: int) -> List[str]:
return chunk_into_n(tests, workers)[worker_id]

with mock.patch(
with tempfile.TemporaryDirectory() as tmpdir, mock.patch(
"ci.ray_ci.linux_tester_container.LinuxTesterContainer.get_artifact_mount",
return_value=("/tmp/artifacts", tmpdir),
), mock.patch(
"ci.ray_ci.linux_tester_container.LinuxTesterContainer._persist_test_results",
return_value=None,
), mock.patch(
"ci.ray_ci.linux_tester_container.LinuxTesterContainer._run_tests_in_docker",
side_effect=_mock_run_tests_in_docker,
), mock.patch(
Expand All @@ -153,5 +165,17 @@ def _mock_shard_tests(tests: List[str], workers: int, worker_id: int) -> List[st
assert not container.run_tests(["bad_test"], [])


def test_create_bazel_log_mount() -> None:
    """
    ``_create_bazel_log_mount`` must return the requested sub-directory joined
    onto both halves of the (patched) artifact mount.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        # Point the artifact mount at a throw-away directory so the directory
        # created by the method under test is removed with it.
        with mock.patch(
            "ci.ray_ci.linux_tester_container.LinuxTesterContainer.get_artifact_mount",
            return_value=("/tmp/artifacts", tmpdir),
        ):
            container = LinuxTesterContainer("team", skip_ray_installation=True)
            expected = ("/tmp/artifacts/w00t", os.path.join(tmpdir, "w00t"))
            assert container._create_bazel_log_mount("w00t") == expected


if __name__ == "__main__":
sys.exit(pytest.main(["-v", __file__]))
1 change: 1 addition & 0 deletions ci/ray_ci/test_tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def test_get_container() -> None:
parallelism_per_worker=2,
network=None,
gpus=0,
tmp_filesystem=None,
)
assert isinstance(container, LinuxTesterContainer)
assert container.docker_tag == "corebuild"
Expand Down
11 changes: 10 additions & 1 deletion ci/ray_ci/tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,11 @@
type=click.Choice(["linux", "windows"]),
help=("Operating system to run tests on"),
)
@click.option(
"--tmp-filesystem",
type=str,
help=("Filesystem to use for /tmp"),
)
def main(
targets: List[str],
team: str,
Expand All @@ -165,6 +170,7 @@ def main(
test_arg: Optional[str],
build_name: Optional[str],
build_type: Optional[str],
tmp_filesystem: Optional[str],
) -> None:
if not bazel_workspace_dir:
raise Exception("Please use `bazelisk run //ci/ray_ci`")
Expand All @@ -182,7 +188,8 @@ def main(
worker_id,
parallelism_per_worker,
gpus,
network,
network=network,
tmp_filesystem=tmp_filesystem,
test_env=list(test_env),
build_name=build_name,
build_type=build_type,
Expand Down Expand Up @@ -218,6 +225,7 @@ def _get_container(
parallelism_per_worker: int,
gpus: int,
network: Optional[str],
tmp_filesystem: Optional[str] = None,
test_env: Optional[List[str]] = None,
build_name: Optional[str] = None,
build_type: Optional[str] = None,
Expand All @@ -237,6 +245,7 @@ def _get_container(
network=network,
skip_ray_installation=skip_ray_installation,
build_type=build_type,
tmp_filesystem=tmp_filesystem,
)

if operating_system == "windows":
Expand Down
52 changes: 49 additions & 3 deletions ci/ray_ci/tester_container.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import os
import platform
import random
import shutil
import string
import subprocess
from typing import List, Optional
from typing import List, Tuple, Optional

from ci.ray_ci.utils import shard_tests, chunk_into_n
from ci.ray_ci.utils import logger
Expand All @@ -16,6 +20,7 @@ def __init__(
self,
shard_count: int = 1,
gpus: int = 0,
bazel_log_dir: str = "/tmp",
network: Optional[str] = None,
test_envs: Optional[List[str]] = None,
shard_ids: Optional[List[int]] = None,
Expand All @@ -28,6 +33,7 @@ def __init__(
used to run tests in a distributed fashion.
:param shard_ids: The list of shard ids to run. If none, run no shards.
"""
self.bazel_log_dir = bazel_log_dir
self.shard_count = shard_count
self.shard_ids = shard_ids or []
self.test_envs = test_envs or []
Expand All @@ -41,6 +47,21 @@ def __init__(
if not skip_ray_installation:
self.install_ray(build_type)

def _create_bazel_log_mount(self, tmp_dir: Optional[str] = None) -> Tuple[str, str]:
"""
Create a temporary directory in the current container to store bazel event logs
produced by the test runs. We do this by using the artifact mount directory from
the host machine as a shared directory between all containers.
"""
tmp_dir = tmp_dir or "".join(
random.choice(string.ascii_lowercase) for _ in range(5)
)
artifact_host, artifact_container = self.get_artifact_mount()
bazel_log_dir_host = os.path.join(artifact_host, tmp_dir)
bazel_log_dir_container = os.path.join(artifact_container, tmp_dir)
os.mkdir(bazel_log_dir_container)
return (bazel_log_dir_host, bazel_log_dir_container)

def run_tests(
self,
test_targets: List[str],
Expand All @@ -65,23 +86,47 @@ def run_tests(

# divide gpus evenly among chunks
gpu_ids = chunk_into_n(list(range(self.gpus)), len(chunks))
bazel_log_dir_host, bazel_log_dir_container = self._create_bazel_log_mount()
runs = [
self._run_tests_in_docker(chunks[i], gpu_ids[i], self.test_envs, test_arg)
self._run_tests_in_docker(
chunks[i], gpu_ids[i], bazel_log_dir_host, self.test_envs, test_arg
)
for i in range(len(chunks))
]
exits = [run.wait() for run in runs]
self._persist_test_results(bazel_log_dir_container)
self._cleanup_bazel_log_mount(bazel_log_dir_container)

return all(exit == 0 for exit in exits)

def _persist_test_results(self, bazel_log_dir: str) -> None:
    """
    Persist the test results found under ``bazel_log_dir``.

    Currently this only forwards the directory to ``_upload_build_info``.

    :param bazel_log_dir: Directory holding the bazel logs of the test runs.
    """
    # TODO(can): implement the logic to persist test results to a database
    self._upload_build_info(bazel_log_dir)

def _upload_build_info(self, bazel_log_dir: str) -> None:
    """
    Run ``ci/build/upload_build_info.sh`` with ``bazel_log_dir`` as its argument.

    :param bazel_log_dir: Directory passed to the script as its first argument.
    :raises subprocess.CalledProcessError: If the script exits with a non-zero
        status.
    """
    logger.info("Uploading bazel test logs")
    # The script path is relative, so it is resolved against the current
    # working directory of this process.
    upload_command = ["bash", "ci/build/upload_build_info.sh", bazel_log_dir]
    subprocess.check_call(upload_command)

def _cleanup_bazel_log_mount(self, bazel_log_dir: str) -> None:
    """
    Delete ``bazel_log_dir`` and everything below it.

    :param bazel_log_dir: Directory to remove recursively.
    """
    shutil.rmtree(bazel_log_dir)

def _run_tests_in_docker(
self,
test_targets: List[str],
gpu_ids: List[int],
bazel_log_dir_host: str,
test_envs: List[str],
test_arg: Optional[str] = None,
) -> subprocess.Popen:
logger.info("Running tests: %s", test_targets)
commands = [
"cleanup() { ./ci/build/upload_build_info.sh; }",
f'cleanup() {{ chmod -R a+r "{self.bazel_log_dir}"; }}',
"trap cleanup EXIT",
]
if platform.system() == "Windows":
Expand Down Expand Up @@ -119,5 +164,6 @@ def _run_tests_in_docker(
commands,
network=self.network,
gpu_ids=gpu_ids,
volumes=[f"{bazel_log_dir_host}:{self.bazel_log_dir}"],
)
)
1 change: 1 addition & 0 deletions ci/ray_ci/windows_tester_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def __init__(
self,
shard_count,
gpus=0, # We don't support GPU tests on Windows yet.
bazel_log_dir="C:\\msys64\\tmp\\bazel_event_logs",
network=network,
test_envs=test_envs,
shard_ids=shard_ids,
Expand Down
Loading

0 comments on commit f5ffe83

Please sign in to comment.