[RLlib] Add APPO/IMPALA multi-agent StatelessCartPole learning tests to CI (+ fix some bugs related to this). #47245
@@ -184,23 +184,6 @@ py_test(
     srcs = ["tuned_examples/appo/cartpole_appo.py"],
     args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
 )
-# StatelessCartPole
-py_test(
-    name = "learning_tests_stateless_cartpole_appo",
-    main = "tuned_examples/appo/stateless_cartpole_appo.py",
-    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
-    size = "large",
-    srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
-    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
-)
-py_test(
-    name = "learning_tests_stateless_cartpole_appo_multi_gpu",
-    main = "tuned_examples/appo/stateless_cartpole_appo.py",
-    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
-    size = "large",
-    srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
-    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
-)
 # MultiAgentCartPole
 py_test(
     name = "learning_tests_multi_agent_cartpole_appo",
@@ -234,6 +217,72 @@ py_test(
     srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"],
     args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2", "--num-cpus=7"]
 )
+# StatelessCartPole
+py_test(
+    name = "learning_tests_stateless_cartpole_appo",
+    main = "tuned_examples/appo/stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
+)
+py_test(
+    name = "learning_tests_stateless_cartpole_appo_gpu",
+    main = "tuned_examples/appo/stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
+    size = "large",
+    srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=1"]
+)
+py_test(
+    name = "learning_tests_stateless_cartpole_appo_multi_cpu",
+    main = "tuned_examples/appo/stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)
+py_test(
+    name = "learning_tests_stateless_cartpole_appo_multi_gpu",
+    main = "tuned_examples/appo/stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
+    size = "large",
+    srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)
+# MultiAgentStatelessCartPole
+py_test(
+    name = "learning_tests_multi_agent_stateless_cartpole_appo",
+    main = "tuned_examples/appo/multi_agent_stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/appo/multi_agent_stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
+)

Same here.

+py_test(
+    name = "learning_tests_multi_agent_stateless_cartpole_appo_gpu",
+    main = "tuned_examples/appo/multi_agent_stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"],
+    size = "large",
+    srcs = ["tuned_examples/appo/multi_agent_stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=1"]
+)
+py_test(
+    name = "learning_tests_multi_agent_stateless_cartpole_appo_multi_cpu",
+    main = "tuned_examples/appo/multi_agent_stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/appo/multi_agent_stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)
+py_test(
+    name = "learning_tests_multi_agent_stateless_cartpole_appo_multi_gpu",
+    main = "tuned_examples/appo/multi_agent_stateless_cartpole_appo.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
+    size = "large",
+    srcs = ["tuned_examples/appo/multi_agent_stateless_cartpole_appo.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)

 #@OldAPIStack
 py_test(
@@ -462,23 +511,6 @@ py_test(
     srcs = ["tuned_examples/impala/cartpole_impala.py"],
     args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
 )
-# StatelessCartPole
-py_test(
-    name = "learning_tests_stateless_cartpole_impala",
-    main = "tuned_examples/impala/stateless_cartpole_impala.py",
-    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
-    size = "large",
-    srcs = ["tuned_examples/impala/stateless_cartpole_impala.py"],
-    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
-)
-py_test(
-    name = "learning_tests_stateless_cartpole_impala_multi_gpu",
-    main = "tuned_examples/impala/stateless_cartpole_impala.py",
-    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
-    size = "large",
-    srcs = ["tuned_examples/impala/stateless_cartpole_impala.py"],
-    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
-)
 # MultiAgentCartPole
 py_test(
     name = "learning_tests_multi_agent_cartpole_impala",
@@ -512,6 +544,40 @@ py_test(
     srcs = ["tuned_examples/impala/multi_agent_cartpole_impala.py"],
     args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-gpus=2", "--num-cpus=7"]
 )
+# StatelessCartPole
+py_test(
+    name = "learning_tests_stateless_cartpole_impala",
+    main = "tuned_examples/impala/stateless_cartpole_impala.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/impala/stateless_cartpole_impala.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
+)

And here. I guess this brings us a …

This actually tries to put the 1 (remote) Learner on 1 GPU. Sorry, you are right in that these command line options are very confusing: on a CPU machine: …; on a GPU machine: … We should probably rename these args.

+py_test(
+    name = "learning_tests_stateless_cartpole_impala_multi_gpu",
+    main = "tuned_examples/impala/stateless_cartpole_impala.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
+    size = "large",
+    srcs = ["tuned_examples/impala/stateless_cartpole_impala.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)
+# MultiAgentStatelessCartPole
+py_test(
+    name = "learning_tests_multi_agent_stateless_cartpole_impala",
+    main = "tuned_examples/impala/multi_agent_stateless_cartpole_impala.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"],
+    size = "large",
+    srcs = ["tuned_examples/impala/multi_agent_stateless_cartpole_impala.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=1"]
+)
+py_test(
+    name = "learning_tests_multi_agent_stateless_cartpole_impala_multi_gpu",
+    main = "tuned_examples/impala/multi_agent_stateless_cartpole_impala.py",
+    tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"],
+    size = "large",
+    srcs = ["tuned_examples/impala/multi_agent_stateless_cartpole_impala.py"],
+    args = ["--as-test", "--enable-new-api-stack", "--num-gpus=2"]
+)

 #@OldAPIstack
 py_test(
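A note on the `--num-gpus` discussion above (my summary of the review thread, not part of the PR): in these example scripts the flag effectively requests that many (remote) Learners, and each Learner is only placed on an actual GPU when the machine has one; on CPU-only CI machines the same flag just produces that many CPU-based Learners (which is why the `multi_cpu` targets also pass `--num-gpus=2`). A purely illustrative sketch of that mapping, with made-up names:

# Hypothetical helper, only to illustrate the semantics discussed in the review
# thread above; RLlib's real example utilities resolve this differently.
def learner_resources_from_flag(num_gpus_arg: int, machine_has_gpus: bool) -> dict:
    # "--num-gpus=N" effectively asks for N (remote) Learners ...
    num_learners = num_gpus_arg
    # ... each of which gets a GPU only if the machine actually has GPUs;
    # on a CPU-only machine, the Learners simply run on CPUs.
    num_gpus_per_learner = 1 if (machine_has_gpus and num_gpus_arg > 0) else 0
    return {
        "num_learners": num_learners,
        "num_gpus_per_learner": num_gpus_per_learner,
    }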
@@ -228,7 +228,7 @@ def __call__(
         # Also, let module-to-env pipeline know that we had added a single timestep
         # time rank to the data (to remove it again).
         if not self._as_learner_connector:
-            for column, column_data in data.copy().items():
+            for column in data.keys():
                 self.foreach_batch_item_change_in_place(
                     batch=data,
                     column=column,

simplify
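A side note on the simplification above (my reading, not stated in the PR): dropping the `.copy()` is safe as long as the loop body only mutates column values in place and never adds or removes columns, since Python only forbids changing a dict's key set while iterating over it. For example:

# Replacing values while iterating over .keys() is fine; adding or removing
# keys during the loop would raise a RuntimeError.
data = {"obs": [1, 2], "actions": [3]}
for column in data.keys():
    # Value replaced in place; the key set stays unchanged.
    data[column] = [item + 1 for item in data[column]]
# data == {"obs": [2, 3], "actions": [4]}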
@@ -250,11 +250,20 @@ def __call__( | |
# Before adding STATE_IN to the `data`, zero-pad existing data and batch | ||
# into max_seq_len chunks. | ||
for column, column_data in data.copy().items(): | ||
# Do not zero-pad INFOS column. | ||
if column == Columns.INFOS: | ||
continue | ||
for key, item_list in column_data.items(): | ||
if column != Columns.INFOS: | ||
column_data[key] = split_and_zero_pad_list( | ||
item_list, T=self.max_seq_len | ||
) | ||
# Multi-agent case AND RLModule is not stateful -> Do not zero-pad | ||
# for this model. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. bug fix: For multi-agent with some RLModules NOT stateful, we should NOT zero-pad anything. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this actually work already when using it on full length episodes coming from |
||
assert isinstance(key, tuple) | ||
if len(key) == 3: | ||
eps_id, aid, mid = key | ||
if not rl_module[mid].is_stateful(): | ||
continue | ||
column_data[key] = split_and_zero_pad_list( | ||
item_list, T=self.max_seq_len | ||
) | ||
|
||
for sa_episode in self.single_agent_episode_iterator( | ||
episodes, | ||
|
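For readers unfamiliar with the helper used above: `split_and_zero_pad_list(item_list, T=...)` chops a flat list of per-timestep items into chunks of (at most) length T and pads the last chunk up to T. A simplified sketch of that behavior for plain scalar items (the real RLlib helper also handles nested/structured data; this is only an illustration):

def split_and_zero_pad_list_sketch(item_list, T):
    """Splits `item_list` into length-T chunks, zero-padding the final chunk."""
    chunks = []
    for start in range(0, len(item_list), T):
        chunk = list(item_list[start:start + T])
        # Pad the last (shorter) chunk with zeros up to length T.
        chunk += [0] * (T - len(chunk))
        chunks.append(chunk)
    return chunks

# 5 timesteps with max_seq_len=3 -> two chunks, the second one zero-padded.
assert split_and_zero_pad_list_sketch([1, 2, 3, 4, 5], T=3) == [[1, 2, 3], [4, 5, 0]]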
@@ -116,9 +116,16 @@ def __call__(
         # anymore to the original observations).
         for sa_episode in self.single_agent_episode_iterator(episodes):
             sa_obs = sa_episode.get_observations(indices=-1)
-            normalized_sa_obs = self._filters[sa_episode.agent_id](
-                sa_obs, update=self._update_stats
-            )
+            try:
+                normalized_sa_obs = self._filters[sa_episode.agent_id](
+                    sa_obs, update=self._update_stats
+                )
+            except KeyError:
+                raise KeyError(
+                    "KeyError trying to access a filter by agent ID "
+                    f"`{sa_episode.agent_id}`! You probably did NOT pass the "
+                    f"`multi_agent=True` flag into the `MeanStdFilter()` constructor. "
+                )
             sa_episode.set_observations(at_indices=-1, new_data=normalized_sa_obs)
             # We set the Episode's observation space to ours so that we can safely
             # set the last obs to the new value (without causing a space mismatch

Make the error better that shows up when …
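For context, the new multi-agent tuned examples in this PR exercise exactly this code path: the filter has to be constructed with `multi_agent=True` so that one filter per agent ID exists. Condensed from the example file added below:

from ray.rllib.algorithms.appo import APPOConfig
from ray.rllib.connectors.env_to_module import MeanStdFilter

config = (
    APPOConfig()
    .env_runners(
        # Per-agent filters; without `multi_agent=True`, the filter lookup by
        # agent ID fails with the KeyError handled above.
        env_to_module_connector=lambda env: MeanStdFilter(multi_agent=True),
    )
)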
@@ -0,0 +1,67 @@
from ray.rllib.algorithms.appo import APPOConfig
from ray.rllib.connectors.env_to_module import MeanStdFilter
from ray.rllib.examples.envs.classes.multi_agent import MultiAgentStatelessCartPole
from ray.rllib.utils.metrics import (
    ENV_RUNNER_RESULTS,
    EPISODE_RETURN_MEAN,
    NUM_ENV_STEPS_SAMPLED_LIFETIME,
)
from ray.rllib.utils.test_utils import add_rllib_example_script_args
from ray.tune.registry import register_env

parser = add_rllib_example_script_args(
    default_timesteps=2000000,
    default_reward=350.0,
)
parser.set_defaults(
    enable_new_api_stack=True,
    num_agents=2,
    num_env_runners=3,
)
# Use `parser` to add your own custom command line options to this script
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()

register_env("env", lambda cfg: MultiAgentStatelessCartPole(config=cfg))


config = (
    APPOConfig()
    # Enable new API stack and use EnvRunner.
    .api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    .environment("env", env_config={"num_agents": args.num_agents})
    .env_runners(
        env_to_module_connector=lambda env: MeanStdFilter(multi_agent=True),
    )
    .training(
        lr=0.0005 * ((args.num_gpus or 1) ** 0.5),
        num_sgd_iter=6,
        vf_loss_coeff=0.05,
        grad_clip=20.0,
    )
    .rl_module(
        model_config_dict={
            "vf_share_layers": True,
            "use_lstm": True,
            "uses_new_env_runners": True,
            "max_seq_len": 50,
        },
    )
    .multi_agent(
        policy_mapping_fn=(lambda agent_id, episode, **kwargs: f"p{agent_id}"),
        policies={f"p{i}" for i in range(args.num_agents)},
    )
)

stop = {
    f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 350.0 * args.num_agents,
    f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": args.stop_timesteps,
}

if __name__ == "__main__":
    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

    run_rllib_example_script_experiment(config, args, stop=stop)
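For reference, the new BUILD targets above run this script as a CI learning test with flag combinations such as `--as-test --enable-new-api-stack --num-agents=2 --num-gpus=1` (the exact flags differ per target; the `multi_cpu`/`multi_gpu` variants pass `--num-gpus=2`).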
@@ -20,18 +20,15 @@
 # and (if needed) use their values to set up `config` below.
 args = parser.parse_args()

-register_env(
-    "multi_agent_cartpole",
-    lambda _: MultiAgentCartPole({"num_agents": args.num_agents}),
-)
+register_env("multi_agent_cartpole", lambda cfg: MultiAgentCartPole(config=cfg))


 config = (
     DQNConfig()
     .api_stack(
         enable_rl_module_and_learner=True,
         enable_env_runner_and_connector_v2=True,
     )
-    .environment(env="multi_agent_cartpole")
+    .environment(env="multi_agent_cartpole", env_config={"num_agents": args.num_agents})
     .training(
         lr=0.0005 * (args.num_gpus or 1) ** 0.5,
         train_batch_size_per_learner=32,

For DQN and SAC we don't have stateful modules enabled yet. What do we need for it? The buffers need to collect time sequences, correct?

Yes, this is the huge advantage of the "episodes-until-the-last-second" design :) Everything now behaves the same and we can simply pass a list of episodes (from offline data) into any Learner, and its Learner connector pipelines behave the exact same.
Is a "gpu" tag missing here while num_gpus=1, or do we simply want to test a remote Learner here?

Correct, we test here the simple case of: 1 (remote) Learner on 1 CPU.