[RLlib] Add APPO/IMPALA multi-agent StatelessCartPole learning tests to CI (+ fix some bugs related to this). #47245

134 changes: 100 additions & 34 deletions rllib/BUILD
@@ -184,23 +184,6 @@ py_test(
srcs = ["tuned_examples/appo/cartpole_appo.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
# StatelessCartPole
py_test(
name = "learning_tests_stateless_cartpole_appo",
main = "tuned_examples/appo/stateless_cartpole_appo.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
)
py_test(
name = "learning_tests_stateless_cartpole_appo_multi_gpu",
main = "tuned_examples/appo/stateless_cartpole_appo.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
# MultiAgentCartPole
py_test(
name = "learning_tests_multi_agent_cartpole_appo",
@@ -234,6 +217,72 @@ py_test(
srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2", "--num-cpus=7"]
)
# StatelessCartPole
py_test(
name = "learning_tests_stateless_cartpole_appo",
main = "tuned_examples/appo/stateless_cartpole_appo.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
Collaborator: Is a "gpu" tag missing here (given `--num-gpus=1`), or do we simply want to test a remote Learner here?

Contributor Author: Correct, we test the simple case here: 1 (remote) Learner on 1 CPU.

size = "large",
srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
)
py_test(
name = "learning_tests_stateless_cartpole_appo_gpu",
main = "tuned_examples/appo/stateless_cartpole_appo.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
size = "large",
srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=1"]
)
py_test(
name = "learning_tests_stateless_cartpole_appo_multi_cpu",
main = "tuned_examples/appo/stateless_cartpole_appo.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
py_test(
name = "learning_tests_stateless_cartpole_appo_multi_gpu",
main = "tuned_examples/appo/stateless_cartpole_appo.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
# MultiAgentStatelessCartPole
py_test(
name = "learning_tests_multi_agent_stateless_cartpole_appo",
main = "tuned_examples/appo/multi_agent_stateless_cartpole_appo.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/appo/multi_agent_stateless_cartpole_appo.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
Collaborator: Same here.
)
py_test(
name = "learning_tests_multi_agent_stateless_cartpole_appo_gpu",
main = "tuned_examples/appo/multi_agent_stateless_cartpole_appo.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
size = "large",
srcs = ["tuned_examples/appo/multi_agent_stateless_cartpole_appo.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=1"]
)
py_test(
name = "learning_tests_multi_agent_stateless_cartpole_appo_multi_cpu",
main = "tuned_examples/appo/multi_agent_stateless_cartpole_appo.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/appo/multi_agent_stateless_cartpole_appo.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
py_test(
name = "learning_tests_multi_agent_stateless_cartpole_appo_multi_gpu",
main = "tuned_examples/appo/multi_agent_stateless_cartpole_appo.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/appo/multi_agent_stateless_cartpole_appo.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)

#@OldAPIStack
py_test(
@@ -462,23 +511,6 @@ py_test(
srcs = ["tuned_examples/impala/cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
# StatelessCartPole
py_test(
name = "learning_tests_stateless_cartpole_impala",
main = "tuned_examples/impala/stateless_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/impala/stateless_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
)
py_test(
name = "learning_tests_stateless_cartpole_impala_multi_gpu",
main = "tuned_examples/impala/stateless_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/impala/stateless_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
# MultiAgentCartPole
py_test(
name = "learning_tests_multi_agent_cartpole_impala",
@@ -512,6 +544,40 @@ py_test(
srcs = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2", "--num-cpus=7"]
)
# StatelessCartPole
py_test(
name = "learning_tests_stateless_cartpole_impala",
main = "tuned_examples/impala/stateless_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/impala/stateless_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
Collaborator: And here. I guess this brings us num_learners=1, doesn't it?

Contributor Author: This actually tries to put the 1 (remote) Learner on 1 GPU.

Sorry, you are right that these command line options are very confusing:

On a CPU machine:
--num-gpus=1 -> 1 (remote) Learner (on CPU!)
--num-gpus=2 -> 2 (remote) Learners (on CPUs!)

On a GPU machine:
--num-gpus=1 -> 1 (remote) Learner (on GPU)
--num-gpus=2 -> 2 (remote) Learners (on GPUs)

We should probably rename these args.

)
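
To make the mapping above concrete, here is a hypothetical sketch (not the actual RLlib test utility; the helper name and resource keys are illustrative only) of how the `--num-gpus` flag effectively resolves to Learner resources:

import torch

def learner_resources_from_num_gpus(num_gpus_arg: int) -> dict:
    # Hypothetical helper, mirroring the behavior described in the comment above:
    # `--num-gpus=N` really selects N (remote) Learners; each Learner only ends up
    # on a GPU if the machine actually has one, otherwise it runs on a CPU.
    has_cuda = torch.cuda.is_available()
    return {
        "num_learners": num_gpus_arg,
        "num_gpus_per_learner": 1 if has_cuda else 0,
    }

# CPU machine: {'num_learners': 2, 'num_gpus_per_learner': 0} -> 2 Learners on CPUs.
# GPU machine: {'num_learners': 2, 'num_gpus_per_learner': 1} -> 2 Learners on GPUs.
print(learner_resources_from_num_gpus(2))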
py_test(
name = "learning_tests_stateless_cartpole_impala_multi_gpu",
main = "tuned_examples/impala/stateless_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/impala/stateless_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)
# MultiAgentStatelessCartPole
py_test(
name = "learning_tests_multi_agent_stateless_cartpole_impala",
main = "tuned_examples/impala/multi_agent_stateless_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
size = "large",
srcs = ["tuned_examples/impala/multi_agent_stateless_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
)
py_test(
name = "learning_tests_multi_agent_stateless_cartpole_impala_multi_gpu",
main = "tuned_examples/impala/multi_agent_stateless_cartpole_impala.py",
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
size = "large",
srcs = ["tuned_examples/impala/multi_agent_stateless_cartpole_impala.py"],
args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
)

#@OldAPIstack
py_test(
19 changes: 14 additions & 5 deletions rllib/connectors/common/add_states_from_episodes_to_batch.py
@@ -228,7 +228,7 @@ def __call__(
# Also, let module-to-env pipeline know that we had added a single timestep
# time rank to the data (to remove it again).
if not self._as_learner_connector:
for column, column_data in data.copy().items():
for column in data.keys():
Contributor Author: Simplify: iterate over the column names only (no need to copy the dict and its values).

self.foreach_batch_item_change_in_place(
batch=data,
column=column,
@@ -250,11 +250,20 @@
# Before adding STATE_IN to the `data`, zero-pad existing data and batch
# into max_seq_len chunks.
for column, column_data in data.copy().items():
# Do not zero-pad INFOS column.
if column == Columns.INFOS:
continue
for key, item_list in column_data.items():
if column != Columns.INFOS:
column_data[key] = split_and_zero_pad_list(
item_list, T=self.max_seq_len
)
# Multi-agent case AND RLModule is not stateful -> Do not zero-pad
# for this model.
Contributor Author: Bug fix: for multi-agent setups in which some RLModules are NOT stateful, we should not zero-pad anything for those modules.

Collaborator: Does this actually already work when using it on full-length episodes coming from OfflineData?

assert isinstance(key, tuple)
if len(key) == 3:
eps_id, aid, mid = key
if not rl_module[mid].is_stateful():
continue
column_data[key] = split_and_zero_pad_list(
item_list, T=self.max_seq_len
)

for sa_episode in self.single_agent_episode_iterator(
episodes,
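
For reference, a minimal hypothetical sketch of the zero-padding semantics assumed above (the real `split_and_zero_pad_list` helper may differ in details): each column's item list is split into chunks of length T (the model's max_seq_len), and the final, shorter chunk is right-padded with zeros. With the bug fix, this padding is skipped entirely for RLModules that are not stateful in the multi-agent case.

def split_and_zero_pad_sketch(items, T):
    # Split `items` into T-sized chunks; zero-pad the last chunk to length T.
    chunks = []
    for start in range(0, len(items), T):
        chunk = list(items[start:start + T])
        chunk += [0] * (T - len(chunk))
        chunks.append(chunk)
    return chunks

# A 5-step column with max_seq_len=3 becomes two equally long chunks:
# [[1, 2, 3], [4, 5, 0]]
print(split_and_zero_pad_sketch([1, 2, 3, 4, 5], T=3))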
13 changes: 10 additions & 3 deletions rllib/connectors/env_to_module/mean_std_filter.py
@@ -116,9 +116,16 @@ def __call__(
# anymore to the original observations).
for sa_episode in self.single_agent_episode_iterator(episodes):
sa_obs = sa_episode.get_observations(indices=-1)
normalized_sa_obs = self._filters[sa_episode.agent_id](
sa_obs, update=self._update_stats
)
try:
Contributor Author: Improve the error message that shows up when the `multi_agent=True` c'tor arg was forgotten.

normalized_sa_obs = self._filters[sa_episode.agent_id](
sa_obs, update=self._update_stats
)
except KeyError:
raise KeyError(
"KeyError trying to access a filter by agent ID "
f"`{sa_episode.agent_id}`! You probably did NOT pass the "
f"`multi_agent=True` flag into the `MeanStdFilter()` constructor. "
)
sa_episode.set_observations(at_indices=-1, new_data=normalized_sa_obs)
# We set the Episode's observation space to ours so that we can safely
# set the last obs to the new value (without causing a space mismatch
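
The improved error message points straight at the fix: in multi-agent setups the connector keeps one filter per agent ID, which requires the `multi_agent=True` constructor flag. The tuned example added in this PR wires it up as follows (excerpt, slightly simplified):

from ray.rllib.algorithms.appo import APPOConfig
from ray.rllib.connectors.env_to_module import MeanStdFilter

config = (
    APPOConfig()
    .env_runners(
        # Without `multi_agent=True`, looking up a filter by agent ID fails with
        # the (now clearer) KeyError raised above.
        env_to_module_connector=lambda env: MeanStdFilter(multi_agent=True),
    )
)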
@@ -3,6 +3,7 @@
from ray.rllib.connectors.connector_v2 import ConnectorV2
from ray.rllib.core.columns import Columns
from ray.rllib.core.rl_module.rl_module import RLModule
from ray.rllib.env.multi_agent_episode import MultiAgentEpisode
from ray.rllib.utils.annotations import override
from ray.rllib.utils.postprocessing.episodes import add_one_ts_to_episodes_and_truncate
from ray.rllib.utils.typing import EpisodeType
@@ -101,6 +102,12 @@ def __call__(
# batch: - - - - - - - T B0- - - - - R Bx- - - - R Bx
# mask : t t t t t t t t f t t t t t t f t t t t t f

# Make each MultiAgentEpisode's ID unique within this batch (append the list
# index) and record the new ID on all of its agents' SingleAgentEpisodes.
if isinstance(episodes[0], MultiAgentEpisode):
for i, ma_episode in enumerate(episodes):
ma_episode.id_ += "_" + str(i)
for sa_episode in ma_episode.agent_episodes.values():
sa_episode.multi_agent_episode_id = ma_episode.id_

for i, sa_episode in enumerate(
self.single_agent_episode_iterator(episodes, agents_that_stepped_only=False)
):
4 changes: 2 additions & 2 deletions rllib/tuned_examples/appo/multi_agent_cartpole_appo.py
@@ -8,7 +8,7 @@
from ray.rllib.utils.test_utils import add_rllib_example_script_args
from ray.tune.registry import register_env

parser = add_rllib_example_script_args()
parser = add_rllib_example_script_args(default_timesteps=2000000)
parser.set_defaults(
enable_new_api_stack=True,
num_agents=2,
@@ -46,7 +46,7 @@

stop = {
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 400.0 * args.num_agents,
f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 2000000,
f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": args.stop_timesteps,
}


@@ -0,0 +1,67 @@
from ray.rllib.algorithms.appo import APPOConfig
from ray.rllib.connectors.env_to_module import MeanStdFilter
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentStatelessCartPole
from ray.rllib.utils.metrics import (
ENV_RUNNER_RESULTS,
EPISODE_RETURN_MEAN,
NUM_ENV_STEPS_SAMPLED_LIFETIME,
)
from ray.rllib.utils.test_utils import add_rllib_example_script_args
from ray.tune.registry import register_env

parser = add_rllib_example_script_args(
default_timesteps=2000000,
default_reward=350.0,
)
parser.set_defaults(
enable_new_api_stack=True,
num_agents=2,
num_env_runners=3,
)
# Use `parser` to add your own custom command line options to this script
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()

register_env("env", lambda cfg: MultiAgentStatelessCartPole(config=cfg))


config = (
APPOConfig()
# Enable new API stack and use EnvRunner.
.api_stack(
enable_rl_module_and_learner=True,
enable_env_runner_and_connector_v2=True,
)
.environment("env", env_config={"num_agents": args.num_agents})
.env_runners(
env_to_module_connector=lambda env: MeanStdFilter(multi_agent=True),
)
.training(
lr=0.0005 * ((args.num_gpus or 1) ** 0.5),
num_sgd_iter=6,
vf_loss_coeff=0.05,
grad_clip=20.0,
)
.rl_module(
model_config_dict={
"vf_share_layers": True,
"use_lstm": True,
"uses_new_env_runners": True,
"max_seq_len": 50,
},
)
.multi_agent(
policy_mapping_fn=(lambda agent_id, episode, **kwargs: f"p{agent_id}"),
policies={f"p{i}" for i in range(args.num_agents)},
)
)

stop = {
f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 350.0 * args.num_agents,
f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": args.stop_timesteps,
}

if __name__ == "__main__":
from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

run_rllib_example_script_experiment(config, args, stop=stop)
7 changes: 2 additions & 5 deletions rllib/tuned_examples/dqn/multi_agent_cartpole_dqn.py
@@ -20,18 +20,15 @@
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()

register_env(
"multi_agent_cartpole",
lambda _: MultiAgentCartPole({"num_agents": args.num_agents}),
)
register_env("multi_agent_cartpole", lambda cfg: MultiAgentCartPole(config=cfg))
Collaborator: For DQN and SAC we don't have stateful modules enabled yet. What do we need for that? The buffers need to collect time sequences, correct?

Contributor Author: Yes, this is the huge advantage of the "episodes-until-the-last-second" design :) Everything now behaves the same and we can simply pass a list of episodes (e.g. from offline data) into any Learner, and its Learner connector pipelines behave exactly the same.
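
A toy sketch (plain Python, not RLlib's actual classes or connector API) of the point made in this reply: because online sampling and offline data produce the same kind of episode objects, a single connector code path can batch either source unchanged.

from dataclasses import dataclass, field
from typing import List

@dataclass
class Episode:  # stand-in for RLlib's SingleAgentEpisode
    observations: List[float] = field(default_factory=list)
    actions: List[int] = field(default_factory=list)
    rewards: List[float] = field(default_factory=list)

def toy_learner_connector(episodes: List[Episode]) -> dict:
    # One code path, regardless of whether `episodes` came from EnvRunners
    # (online sampling) or from OfflineData (recorded trajectories).
    return {
        "obs": [o for ep in episodes for o in ep.observations],
        "actions": [a for ep in episodes for a in ep.actions],
        "rewards": [r for ep in episodes for r in ep.rewards],
    }

online = [Episode([0.1, 0.2], [0, 1], [1.0, 1.0])]
offline = [Episode([0.3], [1], [0.0])]
print(toy_learner_connector(online + offline))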


config = (
DQNConfig()
.api_stack(
enable_rl_module_and_learner=True,
enable_env_runner_and_connector_v2=True,
)
.environment(env="multi_agent_cartpole")
.environment(env="multi_agent_cartpole", env_config={"num_agents": args.num_agents})
.training(
lr=0.0005 * (args.num_gpus or 1) ** 0.5,
train_batch_size_per_learner=32,