From 61c0c5166bf424a0aa2c1ccaa63e8b5602351fb5 Mon Sep 17 00:00:00 2001
From: Shawn
Date: Fri, 20 May 2022 11:36:22 +0800
Subject: [PATCH] [Ray] Support basic subtask retry and lineage reconstruction
 (#2969)

---
 mars/conftest.py                              |  23 ++--
 mars/dataframe/contrib/raydataset/dataset.py  |   7 +-
 .../dataframe/contrib/raydataset/mldataset.py |   8 +-
 .../raydataset/tests/test_raydataset.py       |  14 ++-
 mars/deploy/oscar/session.py                  |  84 +++++++++-----
 mars/deploy/oscar/tests/test_local.py         |   9 +-
 mars/deploy/oscar/tests/test_ray.py           |   8 ++
 .../oscar/tests/test_ray_dag_failover.py      | 108 ++++++++++++++++++
 mars/deploy/utils.py                          |   4 +
 mars/services/task/execution/ray/config.py    |  14 +++
 mars/services/task/execution/ray/executor.py  |   5 +-
 11 files changed, 238 insertions(+), 46 deletions(-)
 create mode 100644 mars/deploy/oscar/tests/test_ray_dag_failover.py

diff --git a/mars/conftest.py b/mars/conftest.py
index fea96af609..14edb87563 100644
--- a/mars/conftest.py
+++ b/mars/conftest.py
@@ -96,10 +96,8 @@ def _ray_large_cluster(request):  # pragma: no cover
     param = getattr(request, "param", {})
     num_nodes = param.get("num_nodes", 3)
     num_cpus = param.get("num_cpus", 16)
-    try:
-        from ray.cluster_utils import Cluster
-    except ModuleNotFoundError:
-        from ray._private.cluster_utils import Cluster
+    from ray.cluster_utils import Cluster
+
     cluster = Cluster()
     remote_nodes = []
     for i in range(num_nodes):
@@ -114,11 +112,14 @@ def _ray_large_cluster(request):  # pragma: no cover
             except TypeError:
                 job_config = None
             ray.init(address=cluster.address, job_config=job_config)
-    register_ray_serializers()
+    use_ray_serialization = param.get("use_ray_serialization", True)
+    if use_ray_serialization:
+        register_ray_serializers()
     try:
-        yield
+        yield cluster
     finally:
-        unregister_ray_serializers()
+        if use_ray_serialization:
+            unregister_ray_serializers()
         Router.set_instance(None)
         RayServer.clear()
         ray.shutdown()
@@ -158,6 +159,14 @@ async def ray_create_mars_cluster(request):
         yield client
 
 
+@pytest.fixture
+def stop_mars():
+    yield
+    import mars
+
+    mars.stop_server()
+
+
 @pytest.fixture(scope="module")
 def _new_test_session():
     from .deploy.oscar.tests.session import new_test_session
diff --git a/mars/dataframe/contrib/raydataset/dataset.py b/mars/dataframe/contrib/raydataset/dataset.py
index 6ec937d5b2..21581c59b8 100644
--- a/mars/dataframe/contrib/raydataset/dataset.py
+++ b/mars/dataframe/contrib/raydataset/dataset.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import operator
+from functools import reduce
 from ....utils import lazy_import
 from .mldataset import _rechunk_if_needed
@@ -59,5 +61,6 @@ def __getstate__():
 
 
 def get_chunk_refs(df):
-    fetched_infos: Dict[str, List] = df.fetch_infos(fields=["object_id"])
-    return fetched_infos["object_id"]
+    fetched_infos: Dict[str, List] = df.fetch_infos(fields=["object_refs"])
+    object_refs = reduce(operator.concat, fetched_infos["object_refs"])
+    return object_refs
diff --git a/mars/dataframe/contrib/raydataset/mldataset.py b/mars/dataframe/contrib/raydataset/mldataset.py
index f239d766f9..badba7446d 100644
--- a/mars/dataframe/contrib/raydataset/mldataset.py
+++ b/mars/dataframe/contrib/raydataset/mldataset.py
@@ -104,10 +104,12 @@ def to_ray_mldataset(df, num_shards: int = None):
     # chunk1 for addr1,
     # chunk2 & chunk3 for addr2,
     # chunk4 for addr1
-    fetched_infos: Dict[str, List] = df.fetch_infos(fields=["band", "object_id"])
+    fetched_infos: Dict[str, List] = df.fetch_infos(fields=["bands", "object_refs"])
     chunk_addr_refs: List[Tuple[Tuple, "ray.ObjectRef"]] = [
-        (band, object_id)
-        for band, object_id in zip(fetched_infos["band"], fetched_infos["object_id"])
+        (bands[0], object_refs[0])
+        for bands, object_refs in zip(
+            fetched_infos["bands"], fetched_infos["object_refs"]
+        )
     ]
     group_to_obj_refs: Dict[str, List[ray.ObjectRef]] = _group_chunk_refs(
         chunk_addr_refs, num_shards
diff --git a/mars/dataframe/contrib/raydataset/tests/test_raydataset.py b/mars/dataframe/contrib/raydataset/tests/test_raydataset.py
index dff9becc4a..29aaeb3b8a 100644
--- a/mars/dataframe/contrib/raydataset/tests/test_raydataset.py
+++ b/mars/dataframe/contrib/raydataset/tests/test_raydataset.py
@@ -60,7 +60,12 @@ async def test_convert_to_ray_dataset(
     with session:
         value = np.random.rand(10, 10)
         chunk_size, num_shards = test_option
-        df: md.DataFrame = md.DataFrame(value, chunk_size=chunk_size)
+        # Ray Dataset requires str column names
+        df: md.DataFrame = md.DataFrame(
+            value,
+            chunk_size=chunk_size,
+            columns=[f"c{i}" for i in range(value.shape[1])],
+        )
         df.execute()
 
         ds = mdd.to_ray_dataset(df, num_shards=num_shards)
@@ -162,7 +167,8 @@ async def test_mars_with_xgboost_sklearn_reg(ray_start_regular_shared, create_cl
     session = new_session(address=create_cluster.address, default=True)
     with session:
         np_X, np_y = make_regression(n_samples=1_0000, n_features=10)
-        X, y = md.DataFrame(np_X), md.DataFrame({"target": np_y})
+        columns = [f"c{i}" for i in range(np_X.shape[1])]
+        X, y = md.DataFrame(np_X, columns=columns), md.DataFrame({"target": np_y})
         df: md.DataFrame = md.concat([md.DataFrame(X), md.DataFrame(y)], axis=1)
         df.execute()
 
@@ -172,10 +178,10 @@ async def test_mars_with_xgboost_sklearn_reg(ray_start_regular_shared, create_cl
 
         import gc
 
-        gc.collect()  # Ensure MLDataset does hold mars dataframe to avoid gc.
+        gc.collect()  # Ensure the Dataset holds the mars dataframe to avoid gc.
         ray_params = RayParams(num_actors=2, cpus_per_actor=1)
         reg = RayXGBRegressor(ray_params=ray_params, random_state=42)
         # train
         reg.fit(RayDMatrix(ds, "target"), y=None, ray_params=ray_params)
         reg.predict(RayDMatrix(ds, "target"))
-        reg.predict(pd.DataFrame(np_X))
+        reg.predict(pd.DataFrame(np_X, columns=columns))
diff --git a/mars/deploy/oscar/session.py b/mars/deploy/oscar/session.py
index 412afa55cb..cf08c962d7 100644
--- a/mars/deploy/oscar/session.py
+++ b/mars/deploy/oscar/session.py
@@ -1161,7 +1161,14 @@ async def fetch(self, *tileables, **kwargs) -> list:
         return result
 
     async def fetch_infos(self, *tileables, fields, **kwargs) -> list:
-        available_fields = {"object_id", "level", "memory_size", "store_size", "band"}
+        available_fields = {
+            "object_id",
+            "object_refs",
+            "level",
+            "memory_size",
+            "store_size",
+            "bands",
+        }
         if fields is None:
             fields = available_fields
         else:
@@ -1175,34 +1182,22 @@ async def fetch_infos(self, *tileables, fields, **kwargs) -> list:
         if kwargs:  # pragma: no cover
             unexpected_keys = ", ".join(list(kwargs.keys()))
             raise TypeError(f"`fetch` got unexpected arguments: {unexpected_keys}")
-
+        # The following fields need to access the storage API to get the meta.
+        _need_query_storage_fields = {"level", "memory_size", "store_size"}
+        _need_query_storage = bool(_need_query_storage_fields & fields)
         with enter_mode(build=True):
-            chunks = []
-            get_chunk_metas = []
-            fetch_infos_list = []
-            for tileable in tileables:
-                fetch_tileable, _ = self._get_to_fetch_tileable(tileable)
-                fetch_infos = []
-                for chunk in fetch_tileable.chunks:
-                    chunks.append(chunk)
-                    get_chunk_metas.append(
-                        self._meta_api.get_chunk_meta.delay(chunk.key, fields=["bands"])
-                    )
-                    fetch_infos.append(
-                        ChunkFetchInfo(tileable=tileable, chunk=chunk, indexes=None)
-                    )
-                fetch_infos_list.append(fetch_infos)
-            chunk_metas = await self._meta_api.get_chunk_meta.batch(*get_chunk_metas)
-            chunk_to_band = {
-                chunk: meta["bands"][0] for chunk, meta in zip(chunks, chunk_metas)
-            }
-
+            chunk_to_bands, fetch_infos_list, result = await self._query_meta_service(
+                tileables, fields, _need_query_storage
+            )
+            if not _need_query_storage:
+                assert result is not None
+                return result
             storage_api_to_gets = defaultdict(list)
             storage_api_to_fetch_infos = defaultdict(list)
             for fetch_info in itertools.chain(*fetch_infos_list):
                 chunk = fetch_info.chunk
-                band = chunk_to_band[chunk]
-                storage_api = await self._get_storage_api(band)
+                bands = chunk_to_bands[chunk]
+                storage_api = await self._get_storage_api(bands[0])
                 storage_api_to_gets[storage_api].append(
                     storage_api.get_infos.delay(chunk.key)
                 )
@@ -1219,7 +1214,7 @@ async def fetch_infos(self, *tileables, fields, **kwargs) -> list:
         for fetch_infos in fetch_infos_list:
             fetched = defaultdict(list)
             for fetch_info in fetch_infos:
-                band = chunk_to_band[fetch_info.chunk]
+                bands = chunk_to_bands[fetch_info.chunk]
                 # Currently there's only one item in the returned List from storage_api.get_infos()
                 data = fetch_info.data[0]
                 if "object_id" in fields:
@@ -1232,12 +1227,47 @@ async def fetch_infos(self, *tileables, fields, **kwargs) -> list:
                     fetched["store_size"].append(data.store_size)
                 # data.band misses ip info, e.g. 'numa-0'
                 # while band doesn't, e.g. (address0, 'numa-0')
-                if "band" in fields:
-                    fetched["band"].append(band)
+                if "bands" in fields:
+                    fetched["bands"].append(bands)
             result.append(fetched)
 
         return result
 
+    async def _query_meta_service(self, tileables, fields, query_storage):
+        chunks = []
+        get_chunk_metas = []
+        fetch_infos_list = []
+        for tileable in tileables:
+            fetch_tileable, _ = self._get_to_fetch_tileable(tileable)
+            fetch_infos = []
+            for chunk in fetch_tileable.chunks:
+                chunks.append(chunk)
+                get_chunk_metas.append(
+                    self._meta_api.get_chunk_meta.delay(
+                        chunk.key,
+                        fields=["bands"] if query_storage else fields,
+                    )
+                )
+                fetch_infos.append(
+                    ChunkFetchInfo(tileable=tileable, chunk=chunk, indexes=None)
+                )
+            fetch_infos_list.append(fetch_infos)
+        chunk_metas = await self._meta_api.get_chunk_meta.batch(*get_chunk_metas)
+        if not query_storage:
+            result = []
+            chunk_to_meta = dict(zip(chunks, chunk_metas))
+            for fetch_infos in fetch_infos_list:
+                fetched = defaultdict(list)
+                for fetch_info in fetch_infos:
+                    for field in fields:
+                        fetched[field].append(chunk_to_meta[fetch_info.chunk][field])
+                result.append(fetched)
+            return {}, fetch_infos_list, result
+        chunk_to_bands = {
+            chunk: meta["bands"] for chunk, meta in zip(chunks, chunk_metas)
+        }
+        return chunk_to_bands, fetch_infos_list, None
+
     async def decref(self, *tileable_keys):
         logger.debug("Decref tileables on client: %s", tileable_keys)
         return await self._lifecycle_api.decref_tileables(list(tileable_keys))
diff --git a/mars/deploy/oscar/tests/test_local.py b/mars/deploy/oscar/tests/test_local.py
index 98f9bf6f27..6e078857b9 100644
--- a/mars/deploy/oscar/tests/test_local.py
+++ b/mars/deploy/oscar/tests/test_local.py
@@ -381,7 +381,12 @@ async def test_fetch_infos(create_cluster):
     assert "level" in fetched_infos
     assert "memory_size" in fetched_infos
     assert "store_size" in fetched_infos
-    assert "band" in fetched_infos
+    assert "bands" in fetched_infos
+
+    fetched_infos = df.fetch_infos(fields=["object_id", "bands"])
+    assert "object_id" in fetched_infos
+    assert "bands" in fetched_infos
+    assert len(fetched_infos) == 2
 
     fetch_infos((df, df), fields=None)
     results_infos = mr.ExecutableTuple([df, df]).execute()._fetch_infos()
@@ -390,7 +395,7 @@ async def test_fetch_infos(create_cluster):
     assert "level" in results_infos[0]
     assert "memory_size" in results_infos[0]
     assert "store_size" in results_infos[0]
-    assert "band" in results_infos[0]
+    assert "bands" in results_infos[0]
 
 
 async def _run_web_session_test(web_address):
diff --git a/mars/deploy/oscar/tests/test_ray.py b/mars/deploy/oscar/tests/test_ray.py
index 74bca575d3..b47e247fd9 100644
--- a/mars/deploy/oscar/tests/test_ray.py
+++ b/mars/deploy/oscar/tests/test_ray.py
@@ -14,12 +14,14 @@
 
 import asyncio
 import copy
+import operator
 import os
 import subprocess
 import sys
 import tempfile
 import threading
 import time
+from functools import reduce
 
 import numpy as np
 import pandas as pd
@@ -154,6 +156,12 @@ async def test_execute_describe(ray_start_regular, create_cluster):
 @pytest.mark.asyncio
 async def test_fetch_infos(ray_start_regular, create_cluster):
     await test_local.test_fetch_infos(create_cluster)
+    df = md.DataFrame(mt.random.RandomState(0).rand(5000, 1, chunk_size=1000))
+    df.execute()
+    fetched_infos = df.fetch_infos(fields=["object_refs"])
+    object_refs = reduce(operator.concat, fetched_infos["object_refs"])
+    assert len(fetched_infos) == 1
+    assert len(object_refs) == 5
 
 
 @require_ray
diff --git a/mars/deploy/oscar/tests/test_ray_dag_failover.py b/mars/deploy/oscar/tests/test_ray_dag_failover.py
new file mode 100644
index 0000000000..bf337b05d6
--- /dev/null
+++ b/mars/deploy/oscar/tests/test_ray_dag_failover.py
@@ -0,0 +1,108 @@
+# Copyright 1999-2021 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import operator
+from functools import reduce
+
+import pandas as pd
+import pytest
+
+import mars
+from .... import dataframe as md
+from .... import tensor as mt
+from ....tests.core import require_ray
+from ....utils import lazy_import
+
+ray = lazy_import("ray")
+
+
+@require_ray
+@pytest.mark.parametrize(
+    "ray_large_cluster",
+    [{"num_nodes": 0, "use_ray_serialization": False}],
+    indirect=True,
+)
+@pytest.mark.parametrize("reconstruction_enabled", [True, False])
+def test_basic_object_reconstruction(
+    ray_large_cluster, reconstruction_enabled, stop_mars
+):
+    config = {
+        "num_heartbeats_timeout": 10,
+        "raylet_heartbeat_period_milliseconds": 200,
+        "object_timeout_milliseconds": 200,
+    }
+    # Workaround to reset the config to the default value.
+    if not reconstruction_enabled:
+        config["lineage_pinning_enabled"] = False
+        subtask_max_retries = 0
+    else:
+        subtask_max_retries = 1
+
+    cluster = ray_large_cluster
+    # Head node with no resources.
+    cluster.add_node(
+        num_cpus=0,
+        _system_config=config,
+        enable_object_reconstruction=reconstruction_enabled,
+    )
+    ray.init(address=cluster.address)
+    # Node to place the initial object.
+    node_to_kill = cluster.add_node(num_cpus=1, object_store_memory=10**8)
+    mars.new_session(
+        backend="ray",
+        config={"scheduling.subtask_max_retries": subtask_max_retries},
+        default=True,
+    )
+    cluster.wait_for_nodes()
+
+    df = md.DataFrame(mt.random.RandomState(0).rand(2_000_000, 1, chunk_size=1_000_000))
+    df.execute()
+    # this will submit new Ray tasks
+    df2 = df.map_chunk(lambda pdf: pdf * 2).execute()
+    executed_infos = df2.fetch_infos(fields=["object_refs"])
+    object_refs = reduce(operator.concat, executed_infos["object_refs"])
+    head5 = df2.head(5).to_pandas()
+
+    cluster.remove_node(node_to_kill, allow_graceful=False)
+    node_to_kill = cluster.add_node(num_cpus=1, object_store_memory=10**8)
+
+    # use a dependent_task to avoid fetching lost objects to local
+    @ray.remote
+    def dependent_task(x):
+        return x
+
+    if reconstruction_enabled:
+        ray.get([dependent_task.remote(ref) for ref in object_refs])
+        new_head5 = df2.head(5).to_pandas()
+        pd.testing.assert_frame_equal(head5, new_head5)
+    else:
+        with pytest.raises(ray.exceptions.RayTaskError):
+            df2.head(5).to_pandas()
+        with pytest.raises(ray.exceptions.ObjectLostError):
+            ray.get(object_refs)
+
+    # Losing the object a second time will cause reconstruction to fail because
+    # we have reached the max task retries.
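+    # (With subtask_max_retries=1, each chunk-producing Ray task may be
+    # re-executed at most once for lineage reconstruction; that single retry
+    # was consumed above, so the next loss cannot be recovered.)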
+    cluster.remove_node(node_to_kill, allow_graceful=False)
+    cluster.add_node(num_cpus=1, object_store_memory=10**8)
+
+    if reconstruction_enabled:
+        with pytest.raises(
+            ray.exceptions.ObjectReconstructionFailedMaxAttemptsExceededError
+        ):
+            ray.get(object_refs)
+    else:
+        with pytest.raises(ray.exceptions.ObjectLostError):
+            ray.get(object_refs)
diff --git a/mars/deploy/utils.py b/mars/deploy/utils.py
index 3f7378289b..8ca658cd96 100644
--- a/mars/deploy/utils.py
+++ b/mars/deploy/utils.py
@@ -163,6 +163,10 @@ def load_config(config: Union[str, Dict], default_config_file: str):
                 "ensure enough homogeneous subtasks to calculate statistics."
             )
             config["task"]["default_config"]["initial_same_color_num"] = 1
+    ray_execution_config = config["task"]["execution_config"].setdefault("ray", {})
+    subtask_max_retries = config["scheduling"].get("subtask_max_retries")
+    if subtask_max_retries is not None:
+        ray_execution_config.setdefault("subtask_max_retries", subtask_max_retries)
     return config
 
 
diff --git a/mars/services/task/execution/ray/config.py b/mars/services/task/execution/ray/config.py
index f87deeefd7..1af7899995 100644
--- a/mars/services/task/execution/ray/config.py
+++ b/mars/services/task/execution/ray/config.py
@@ -18,10 +18,20 @@
 from ..utils import get_band_resources_from_config
 
 
+# The default number of times to retry a subtask.
+DEFAULT_SUBTASK_MAX_RETRIES = 3
+
+
 @register_config_cls
 class RayExecutionConfig(ExecutionConfig):
     name = "ray"
 
+    def __init__(self, execution_config: Dict):
+        super().__init__(execution_config)
+        self._subtask_max_retries = self._execution_config.get("ray", {}).get(
+            "subtask_max_retries", DEFAULT_SUBTASK_MAX_RETRIES
+        )
+
     def get_band_resources(self):
         """
         Get the band resources from config for generating ray virtual
@@ -31,3 +41,7 @@ def get_band_resources(self):
 
     def get_deploy_band_resources(self) -> List[Dict[str, Resource]]:
         return []
+
+    @property
+    def subtask_max_retries(self):
+        return self._subtask_max_retries
diff --git a/mars/services/task/execution/ray/executor.py b/mars/services/task/execution/ray/executor.py
index 7b52048ea4..201ba8da74 100644
--- a/mars/services/task/execution/ray/executor.py
+++ b/mars/services/task/execution/ray/executor.py
@@ -283,8 +283,11 @@ async def execute_subtask_graph(
         output_keys = self._get_subtask_output_keys(subtask_chunk_graph)
         output_meta_keys = result_meta_keys & output_keys
         output_count = len(output_keys) + bool(output_meta_keys)
+        subtask_max_retries = (
+            self._config.subtask_max_retries if subtask.retryable else 0
+        )
         output_object_refs = self._ray_executor.options(
-            num_returns=output_count
+            num_returns=output_count, max_retries=subtask_max_retries
         ).remote(
             subtask.task_id,
             subtask.subtask_id,
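
Note: the pieces above fit together as follows: `scheduling.subtask_max_retries` is copied by `load_config` into the Ray execution config (`task.execution_config.ray.subtask_max_retries`), and the executor forwards it as `max_retries` of each retryable subtask's Ray task, so Ray's lineage reconstruction can re-run lost subtasks. A minimal usage sketch (assumes a reachable Ray cluster and the Ray execution backend; the tensor and its sizes are illustrative only, not part of this patch):

    import mars
    import mars.tensor as mt

    # scheduling.subtask_max_retries flows through load_config into the Ray
    # execution config, and from there into max_retries of each retryable
    # subtask's Ray remote call.
    mars.new_session(
        backend="ray",
        config={"scheduling.subtask_max_retries": 3},
        default=True,
    )
    t = mt.random.rand(10_000, 10, chunk_size=1_000)
    print(t.sum().execute())
    mars.stop_server()

With object reconstruction enabled on the Ray cluster (`enable_object_reconstruction=True`, as exercised by the new test), a lost chunk object can then be recomputed by re-running its producing subtask up to the configured number of retries.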