Skip to content

Commit

Permalink
Merge branch 'master' into deflak-test-metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
GeneDer authored Sep 30, 2024
2 parents 869594b + afbb719 commit 0838d2a
Show file tree
Hide file tree
Showing 373 changed files with 6,172 additions and 10,332 deletions.
4 changes: 4 additions & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -2525,6 +2525,7 @@ filegroup(
"//src/ray/protobuf:runtime_env_agent_py_proto",
"//src/ray/protobuf:runtime_env_common_py_proto",
"//src/ray/protobuf:usage_py_proto",
"//src/ray/protobuf:export_event_py_proto",
],
)

Expand Down Expand Up @@ -2618,6 +2619,9 @@ genrule(
# of experimental.
autoscale_files=(`ls python/ray/core/generated/instance_manager_pb2*.py`)
sed -i -E 's/from ..experimental/from ./' "$${autoscale_files[@]}"
# Help the generated export api files to have the correct module
export_api_files=(`ls python/ray/core/generated/export*_pb2*.py`)
sed -i -E 's/from ..export_api/from ./' "$${export_api_files[@]}"
echo "$${PWD}" > $@
""",
local = 1,
Expand Down
2 changes: 1 addition & 1 deletion ci/docker/base.gpu.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ RUN ln -s /usr/bin/clang-format-12 /usr/bin/clang-format && \
ln -s /usr/bin/clang-tidy-12 /usr/bin/clang-tidy && \
ln -s /usr/bin/clang-12 /usr/bin/clang

RUN curl -o- https://get.docker.com | sh
RUN curl -o- https://get.docker.com | sh -s -- --version 27.2

# System conf for tests
RUN locale -a
Expand Down
2 changes: 1 addition & 1 deletion ci/docker/base.test.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ ln -s /usr/bin/clang-12 /usr/bin/clang

EOF

RUN curl -o- https://get.docker.com | sh
RUN curl -o- https://get.docker.com | sh -s -- --version 27.2

# System conf for tests
RUN locale -a
Expand Down
5 changes: 1 addition & 4 deletions ci/docker/min.build.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,11 @@ elif [[ "${EXTRA_DEPENDENCY}" == "ml" ]]; then
elif [[ "${EXTRA_DEPENDENCY}" == "default" ]]; then
pip-compile -o min_requirements.txt python/setup.py --extra default
elif [[ "${EXTRA_DEPENDENCY}" == "serve" ]]; then
pip-compile -o min_requirements.txt python/setup.py --extra serve
pip-compile -o min_requirements.txt python/setup.py --extra serve-grpc
fi

if [[ -f min_requirements.txt ]]; then
pip install -r min_requirements.txt
fi

EOF



17 changes: 13 additions & 4 deletions ci/ray_ci/docker_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
]
GPU_PLATFORM = "cu12.1.1-cudnn8"

PYTHON_VERSIONS_RAY = ["3.9", "3.10", "3.11"]
PYTHON_VERSIONS_RAY_ML = ["3.9", "3.10"]
PYTHON_VERSIONS_RAY = ["3.9", "3.10", "3.11", "3.12"]
PYTHON_VERSIONS_RAY_ML = ["3.9", "3.10", "3.11"]
ARCHITECTURES_RAY = ["x86_64", "aarch64"]
ARCHITECTURES_RAY_ML = ["x86_64"]

Expand All @@ -46,6 +46,15 @@ def __init__(
upload: bool = False,
) -> None:
assert "RAYCI_CHECKOUT_DIR" in os.environ, "RAYCI_CHECKOUT_DIR not set"

assert python_version in PYTHON_VERSIONS_RAY
assert platform in PLATFORMS_RAY
assert architecture in ARCHITECTURES_RAY
if image_type == RayType.RAY_ML:
assert python_version in PYTHON_VERSIONS_RAY_ML
assert platform in PLATFORMS_RAY_ML
assert architecture in ARCHITECTURES_RAY_ML

rayci_checkout_dir = os.environ["RAYCI_CHECKOUT_DIR"]
self.python_version = python_version
self.platform = platform
Expand Down Expand Up @@ -122,13 +131,13 @@ def _get_image_tags(self, external: bool = False) -> List[str]:
versions = self._get_image_version_tags(external)

platforms = [self.get_platform_tag()]
if self.platform == "cpu" and self.image_type == "ray":
if self.platform == "cpu" and self.image_type == RayType.RAY:
# no tag is alias to cpu for ray image
platforms.append("")
elif self.platform == GPU_PLATFORM:
# gpu is alias to cu118 for ray image
platforms.append("-gpu")
if self.image_type == "ray-ml":
if self.image_type == RayType.RAY_ML:
# no tag is alias to gpu for ray-ml image
platforms.append("")

Expand Down
5 changes: 3 additions & 2 deletions ci/ray_ci/test_ray_docker_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from ci.ray_ci.builder_container import DEFAULT_PYTHON_VERSION
from ci.ray_ci.container import _DOCKER_ECR_REPO
from ci.ray_ci.docker_container import GPU_PLATFORM
from ci.ray_ci.ray_docker_container import RayDockerContainer
from ci.ray_ci.test_base import RayCITestBase
from ci.ray_ci.utils import RAY_VERSION
Expand Down Expand Up @@ -203,8 +204,8 @@ def test_canonical_tag(self) -> None:
container = RayDockerContainer(v, "cpu", "ray", "aarch64")
assert container._get_canonical_tag() == f"{sha}-{pv}-cpu-aarch64"

container = RayDockerContainer(v, "cu11.8.0-cudnn8", "ray-ml")
assert container._get_canonical_tag() == f"{sha}-{pv}-cu118"
container = RayDockerContainer(v, GPU_PLATFORM, "ray-ml")
assert container._get_canonical_tag() == f"{sha}-{pv}-cu121"

with mock.patch.dict(os.environ, {"BUILDKITE_BRANCH": "releases/1.0.0"}):
container = RayDockerContainer(v, "cpu", "ray")
Expand Down
1 change: 1 addition & 0 deletions doc/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,7 @@ doctest(
"source/rllib/rllib-sample-collection.rst",
],
),
data = ["//rllib:cartpole-v1_large"],
tags = ["team:rllib"],
)

Expand Down
4 changes: 0 additions & 4 deletions doc/source/_includes/rllib/new_api_stack.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,4 @@
The Ray Team plans to transition algorithms, example scripts, and documentation to the new code base
thereby incrementally replacing the "old API stack" (e.g., ModelV2, Policy, RolloutWorker) throughout the subsequent minor releases leading up to Ray 3.0.

Note, however, that so far only PPO (single- and multi-agent) and SAC (single-agent only)
support the "new API stack" and continue to run by default with the old APIs.
You can continue to use the existing custom (old stack) classes.

:doc:`See here </rllib/rllib-new-api-stack>` for more details on how to use the new API stack.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
30 changes: 30 additions & 0 deletions doc/source/data/monitoring-your-workload.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,40 @@ Monitoring Your Workload

This section helps you debug and monitor the execution of your :class:`~ray.data.Dataset` by viewing the:

* :ref:`Ray Data progress bars <ray-data-progress-bars>`
* :ref:`Ray Data dashboard <ray-data-dashboard>`
* :ref:`Ray Data logs <ray-data-logs>`
* :ref:`Ray Data stats <ray-data-stats>`

.. _ray-data-progress-bars:

Ray Data progress bars
----------------------

When you execute a :class:`~ray.data.Dataset`, Ray Data displays a set of progress bars in the console. These progress bars show various execution and progress-related metrics, including the number of rows completed/remaining, resource usage, and task/actor status. See the following annotated image for a breakdown of how to interpret the progress bar output:

.. image:: images/dataset-progress-bar.png
:align: center


Some additional notes on progress bars:

* The progress bars are updated every second; resource usage, metrics, and task/actor status may take up to 5 seconds to update.
* The label `[backpressure]` in the tasks section indicates that the operator is *backpressured*, meaning that the operator won't submit more tasks until the downstream operator is ready to accept more data.
* The global resource usage is the sum of resources used by all operators, active and requested (includes pending scheduling and pending node assignment).

Configuring the progress bar
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Depending on your use case, you may not be interested in the full progress bar output, or you may wish to turn the progress bars off altogether. Ray Data provides several ways to accomplish this:

* Disabling operator-level progress bars: Set `DataContext.get_current().enable_operator_progress_bars = False`. This only shows the global progress bar, and omits operator-level progress bars.
* Disabling all progress bars: Set `DataContext.get_current().enable_progress_bars = False`. This disables all progress bars from Ray Data related to dataset execution.
* Disabling `ray_tqdm`: Set `DataContext.get_current().use_ray_tqdm = False`. This configures Ray Data to use the base `tqdm` library instead of the custom distributed `tqdm` implementation, which could be useful when debugging logging issues in a distributed setting.

By default, Ray Data truncates operator names longer than a threshold of 100 characters, so that long operator names don't make the progress bar too wide to fit on the screen.

* To turn off this behavior and show the full operator name, set `DataContext.get_current().enable_progress_bar_name_truncation = False`.
* To change the truncation threshold, override the constant, for example `ray.data._internal.progress_bar.ProgressBar.MAX_NAME_LENGTH = 42`.

.. _ray-data-dashboard:

Expand Down
2 changes: 1 addition & 1 deletion doc/source/ray-overview/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,7 @@ We publish the dependencies that are installed in our ``ray`` Docker images for
.. tab-item:: ray (Python 3.9)
:sync: ray (Python 3.9)

Ray version: nightly (`679989c <https://github.com/ray-project/ray/commit/679989c00774c7fa0b94db711ce8eedda2190765>`_)
Ray version: nightly (`d2982b7 <https://github.com/ray-project/ray/commit/d2982b7b4f0e10e2f6143f932803017728387b73>`_)

.. literalinclude:: ./pip_freeze_ray-py39-cpu.txt

Expand Down
20 changes: 10 additions & 10 deletions doc/source/ray-overview/pip_freeze_ray-ml-py39-cpu.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ ax-platform==0.3.2
azure-cli-core==2.40.0
azure-cli-telemetry==1.0.8
azure-common==1.1.28
azure-core==1.30.2
azure-core==1.31.0
azure-identity==1.10.0
azure-mgmt-compute==23.1.0
azure-mgmt-core==1.4.0
Expand All @@ -61,11 +61,11 @@ boto3==1.26.76
botocore==1.29.76
botorch==0.8.5
Brotli @ file:///croot/brotli-split_1714483155106/work
build==1.2.1
build==1.2.2
cached-property==1.5.2
cachetools==5.3.2
certifi==2023.11.17
cffi @ file:///croot/cffi_1714483155441/work
cffi==1.16.0
charset-normalizer==3.3.2
chess==1.7.0
chex==0.1.7
Expand Down Expand Up @@ -235,7 +235,7 @@ markdown-it-py==2.2.0
MarkupSafe==2.1.3
matplotlib==3.7.4
matplotlib-inline==0.1.6
mdit-py-plugins==0.4.1
mdit-py-plugins==0.4.2
mdurl==0.1.2
MedPy==0.4.0
memray==1.10.0
Expand Down Expand Up @@ -287,7 +287,7 @@ nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.20
nvidia-nvjitlink-cu12==12.6.68
nvidia-nvtx-cu12==12.1.105
oauth2client==4.1.3
oauthlib==3.2.2
Expand All @@ -299,9 +299,9 @@ opencensus-context==0.1.3
opencv-python==4.8.1.78
opentelemetry-api==1.1.0
opentelemetry-exporter-otlp==1.1.0
opentelemetry-exporter-otlp-proto-common==1.26.0
opentelemetry-exporter-otlp-proto-common==1.27.0
opentelemetry-exporter-otlp-proto-grpc==1.1.0
opentelemetry-exporter-otlp-proto-http==1.26.0
opentelemetry-exporter-otlp-proto-http==1.27.0
opentelemetry-proto==1.1.0
opentelemetry-sdk==1.1.0
opentelemetry-semantic-conventions==0.20b0
Expand Down Expand Up @@ -376,7 +376,7 @@ PyYAML==6.0.1
pyzmq==26.0.3
qpd==0.4.4
querystring-parser==1.2.4
ray @ file:///home/ray/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl#sha256=12b22b71023ef14a77ef7df4bbea91a302faeb76924ff36ad33547071720cfe4
ray @ file:///home/ray/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl#sha256=5d1e3379c141110a198d1405999bd3937976d0d003ba039a69c970407a05872c
recsim==0.2.4
redis==3.5.3
referencing==0.35.1
Expand Down Expand Up @@ -442,7 +442,7 @@ tensorflow-probability==0.23.0
tensorstore==0.1.63
termcolor==2.4.0
terminado==0.18.1
textual==0.77.0
textual==0.80.0
tf-slim==1.1.0
tf2onnx==1.15.1
threadpoolctl==3.1.0
Expand Down Expand Up @@ -474,7 +474,7 @@ typeguard==2.13.3
typer==0.12.3
types-python-dateutil==2.9.0.20240316
typing_extensions==4.8.0
tzdata==2024.1
tzdata==2024.2
uc-micro-py==1.0.3
uri-template==1.3.0
uritemplate==3.0.1
Expand Down
10 changes: 5 additions & 5 deletions doc/source/ray-overview/pip_freeze_ray-py39-cpu.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ attrs==21.4.0
azure-cli-core==2.40.0
azure-cli-telemetry==1.0.8
azure-common==1.1.28
azure-core==1.30.2
azure-core==1.31.0
azure-identity==1.10.0
azure-mgmt-compute==23.1.0
azure-mgmt-core==1.4.0
Expand All @@ -27,7 +27,7 @@ botocore==1.29.165
Brotli @ file:///croot/brotli-split_1714483155106/work
cachetools==5.3.2
certifi==2023.11.17
cffi @ file:///croot/cffi_1714483155441/work
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
cloudpickle==2.2.0
Expand Down Expand Up @@ -122,15 +122,15 @@ pydantic_core==2.14.1
Pygments==2.18.0
PyJWT==2.9.0
PyNaCl==1.5.0
pyOpenSSL==22.1.0
pyOpenSSL==23.0.0
pyparsing==3.1.4
pyrsistent==0.20.0
PySocks @ file:///tmp/build/80754af9/pysocks_1605305812635/work
python-dateutil==2.8.2
python-dotenv==1.0.1
pytz==2022.7.1
PyYAML==6.0.1
ray @ file:///home/ray/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl#sha256=3ffd977059119b997d03fcf4914449c5109cef7f65cfcb84957b53ff741b004b
ray @ file:///home/ray/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl#sha256=29398ae540c9e4047328710b6bd8cce2cc9f4f82deb75db76a8632d6f602dea5
redis==3.5.3
requests==2.31.0
requests-oauthlib==2.0.0
Expand All @@ -149,7 +149,7 @@ starlette==0.36.3
tabulate==0.9.0
tensorboardX==2.6.2.2
tifffile==2024.7.21
tqdm @ file:///croot/tqdm_1716395931952/work
tqdm @ file:///croot/tqdm_1724853939799/work
typer==0.12.3
typing_extensions==4.8.0
uritemplate==3.0.1
Expand Down
5 changes: 4 additions & 1 deletion doc/source/rllib/doc_code/catalog_guide.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,10 @@ def __init__(self, *args, **kwargs):

config = (
PPOConfig()
.api_stack(enable_rl_module_and_learner=True)
.api_stack(
enable_rl_module_and_learner=True,
enable_env_runner_and_connector_v2=True,
)
.environment("CartPole-v1")
.framework("torch")
)
Expand Down
4 changes: 2 additions & 2 deletions doc/source/rllib/doc_code/checkpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
import tempfile

from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.checkpoints import convert_to_msgpack_checkpoint


# Base config used for both pickle-based checkpoint and msgpack-based one.
config = DQNConfig().environment("CartPole-v1")
config = PPOConfig().environment("CartPole-v1").env_runners(num_env_runners=0)
# Build algorithm object.
algo1 = config.build()

Expand Down
Loading

0 comments on commit 0838d2a

Please sign in to comment.