From 231a013d1ccad59f92a44dda1fa6fd03f2e1d5a8 Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Wed, 19 Jun 2024 11:54:08 +0200 Subject: [PATCH] [RLlib] IMPALA on new API stack (w/ EnvRunner- and ConnectorV2 APIs). (#42085) --- rllib/BUILD | 137 +- rllib/algorithms/algorithm.py | 114 +- rllib/algorithms/algorithm_config.py | 26 +- rllib/algorithms/appo/appo.py | 38 +- rllib/algorithms/appo/appo_learner.py | 10 + rllib/algorithms/appo/appo_tf_policy.py | 16 +- rllib/algorithms/appo/appo_torch_policy.py | 11 +- .../appo/tests/test_appo_learner.py | 76 +- .../appo/tests/test_appo_off_policyness.py | 55 - .../appo/torch/appo_torch_learner.py | 22 +- rllib/algorithms/impala/impala.py | 1098 ++++++++++------- rllib/algorithms/impala/impala_learner.py | 297 +++-- .../impala/tests/test_impala_learner.py | 106 -- .../tests/test_impala_off_policyness.py | 60 - .../algorithms/impala/tf/impala_tf_learner.py | 1 - .../impala/torch/impala_torch_learner.py | 121 +- .../impala/torch/vtrace_torch_v2.py | 30 +- rllib/algorithms/ppo/ppo_learner.py | 22 +- .../ppo/tests/test_ppo_with_env_runner.py | 79 +- .../algorithms/ppo/torch/ppo_torch_learner.py | 8 +- .../ppo/torch/ppo_torch_rl_module.py | 11 +- .../common/batch_individual_items.py | 33 +- rllib/connectors/common/numpy_to_tensor.py | 43 +- rllib/connectors/learner/__init__.py | 4 + .../add_one_ts_to_episodes_and_truncate.py | 131 ++ rllib/core/learner/learner.py | 96 +- rllib/core/learner/learner_group.py | 260 ++-- rllib/core/learner/torch/torch_learner.py | 49 +- rllib/core/rl_module/marl_module.py | 26 +- rllib/env/env_runner.py | 40 +- rllib/env/env_runner_group.py | 201 +-- rllib/env/multi_agent_env_runner.py | 161 ++- rllib/env/single_agent_env_runner.py | 243 ++-- .../env/tests/test_single_agent_env_runner.py | 98 +- rllib/env/utils/infinite_lookback_buffer.py | 3 + rllib/evaluation/postprocessing_v2.py | 38 - .../flatten_observations_dict_space.py | 46 +- rllib/examples/connectors/frame_stacking.py | 16 +- .../examples/connectors/mean_std_filtering.py | 17 +- rllib/examples/envs/custom_gym_env.py | 2 +- ...cy_inference_after_training_w_connector.py | 2 +- ...ock_paper_scissors_heuristic_vs_learned.py | 1 + .../rl_modules/classes/tiny_atari_cnn.py | 60 +- rllib/examples/rl_modules/custom_rl_module.py | 24 +- rllib/policy/sample_batch.py | 15 +- rllib/tests/test_nested_observation_spaces.py | 115 +- .../appo/cartpole-appo-separate-losses.py | 2 +- ...artpole-appo-w-rl-modules-and-learner.yaml | 27 - .../impala/cartpole-impala.yaml | 2 +- .../tuned_examples/impala/cartpole_impala.py | 46 + .../multi_agent_cartpole_impala_envrunner.py | 46 + .../tuned_examples/impala/pendulum_impala.py | 46 + rllib/tuned_examples/impala/pong_impala.py | 93 ++ .../impala/pong_impala_pb2_hyperopt.py | 130 ++ rllib/utils/actor_manager.py | 3 +- rllib/utils/metrics/__init__.py | 13 + rllib/utils/metrics/stats.py | 22 +- rllib/utils/minibatch_utils.py | 32 + rllib/utils/postprocessing/episodes.py | 2 +- .../utils/postprocessing/value_predictions.py | 2 + rllib/utils/test_utils.py | 27 +- rllib/utils/torch_utils.py | 19 +- 62 files changed, 2835 insertions(+), 1739 deletions(-) delete mode 100644 rllib/algorithms/appo/tests/test_appo_off_policyness.py delete mode 100644 rllib/algorithms/impala/tests/test_impala_learner.py delete mode 100644 rllib/algorithms/impala/tests/test_impala_off_policyness.py create mode 100644 rllib/connectors/learner/add_one_ts_to_episodes_and_truncate.py delete mode 100644 rllib/evaluation/postprocessing_v2.py delete mode 100644 
rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml create mode 100644 rllib/tuned_examples/impala/cartpole_impala.py create mode 100644 rllib/tuned_examples/impala/multi_agent_cartpole_impala_envrunner.py create mode 100644 rllib/tuned_examples/impala/pendulum_impala.py create mode 100644 rllib/tuned_examples/impala/pong_impala.py create mode 100644 rllib/tuned_examples/impala/pong_impala_pb2_hyperopt.py diff --git a/rllib/BUILD b/rllib/BUILD index 8b8dc04b2fc7..08443b2b4000 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -152,16 +152,6 @@ py_test( # -------------------------------------------------------------------- # APPO -#@OldAPIStack -py_test( - name = "learning_tests_cartpole_appo_hybrid_api_stack", - main = "tests/run_regression_tests.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "no_tf_static_graph"], - size = "medium", # bazel may complain about it being too long sometimes - medium is on purpose as some frameworks take longer - srcs = ["tests/run_regression_tests.py"], - data = ["tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml"], - args = ["--dir=tuned_examples/appo"] -) #@OldAPIStack py_test( @@ -335,18 +325,16 @@ py_test( ) # IMPALA -#@OldAPIStack -# py_test( -# name = "learning_tests_cartpole_impala_old_api_stack", -# main = "tests/run_regression_tests.py", -# tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"], -# size = "large", -# srcs = ["tests/run_regression_tests.py"], -# data = ["tuned_examples/impala/cartpole-impala.yaml"], -# args = ["--dir=tuned_examples/impala"] -# ) +py_test( + name = "learning_tests_cartpole_impala", + main = "tuned_examples/impala/cartpole_impala.py", + tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "torch_only"], + size = "large", + srcs = ["tuned_examples/impala/cartpole_impala.py"], + args = ["--as-test", "--enable-new-api-stack"] +) -#@OldAPIStack +#@OldAPIstack py_test( name = "learning_tests_cartpole_separate_losses_impala_old_api_stack", main = "tests/run_regression_tests.py", @@ -578,12 +566,6 @@ py_test( size = "large", srcs = ["algorithms/appo/tests/test_appo.py"] ) -py_test( - name = "test_appo_off_policyness", - tags = ["team:rllib", "algorithms_dir", "multi_gpu", "exclusive"], - size = "large", - srcs = ["algorithms/appo/tests/test_appo_off_policyness.py"] -) py_test( name = "test_appo_learner", tags = ["team:rllib", "algorithms_dir"], @@ -632,7 +614,7 @@ py_test( srcs = ["algorithms/dreamerv3/tests/test_dreamerv3.py"] ) -# Impala +# IMPALA py_test( name = "test_impala", tags = ["team:rllib", "algorithms_dir"], @@ -651,18 +633,6 @@ py_test( size = "small", srcs = ["algorithms/impala/tests/test_vtrace_v2.py"] ) -py_test( - name = "test_impala_off_policyness", - tags = ["team:rllib", "algorithms_dir", "exclusive"], - size = "large", - srcs = ["algorithms/impala/tests/test_impala_off_policyness.py"] -) -py_test( - name = "test_impala_learner", - tags = ["team:rllib", "algorithms_dir"], - size = "medium", - srcs = ["algorithms/impala/tests/test_impala_learner.py"] -) # MARWIL py_test( @@ -739,8 +709,9 @@ py_test( # Tag: memory_leak_tests # -------------------------------------------------------------------- +# @OldAPIStack py_test( - name = "test_memory_leak_appo", + name = "test_memory_leak_appo_old_api_stack", tags = ["team:rllib", "memory_leak_tests"], main = "utils/tests/run_memory_leak_tests.py", size = "large", @@ 
-748,9 +719,9 @@ py_test( data = ["tuned_examples/appo/memory-leak-test-appo.yaml"], args = ["--dir=tuned_examples/appo"] ) - +# @OldAPIStack py_test( - name = "test_memory_leak_dqn", + name = "test_memory_leak_dqn_old_api_stack", tags = ["team:rllib", "memory_leak_tests"], main = "utils/tests/run_memory_leak_tests.py", size = "large", @@ -758,9 +729,9 @@ py_test( data = ["tuned_examples/dqn/memory-leak-test-dqn.yaml"], args = ["--dir=tuned_examples/dqn"] ) - +# @OldAPIStack py_test( - name = "test_memory_leak_impala", + name = "test_memory_leak_impala_old_api_stack", tags = ["team:rllib", "memory_leak_tests"], main = "utils/tests/run_memory_leak_tests.py", size = "large", @@ -768,9 +739,9 @@ py_test( data = ["tuned_examples/impala/memory-leak-test-impala.yaml"], args = ["--dir=tuned_examples/impala"] ) - +# @OldAPIStack py_test( - name = "test_memory_leak_ppo", + name = "test_memory_leak_ppo_old_api_stack", tags = ["team:rllib", "memory_leak_tests"], main = "utils/tests/run_memory_leak_tests.py", size = "large", @@ -778,19 +749,9 @@ py_test( data = ["tuned_examples/ppo/memory-leak-test-ppo.yaml"], args = ["--dir=tuned_examples/ppo"] ) - -py_test( - name = "test_memory_leak_ppo_new_stack", - tags = ["team:rllib", "memory_leak_tests"], - main = "utils/tests/run_memory_leak_tests.py", - size = "large", - srcs = ["utils/tests/run_memory_leak_tests.py"], - data = ["tuned_examples/ppo/memory_leak_test_ppo_new_stack.py"], - args = ["--dir=tuned_examples/ppo", "--to-check=rollout_worker"] -) - +# @OldAPIStack py_test( - name = "test_memory_leak_sac", + name = "test_memory_leak_sac_old_api_stack", tags = ["team:rllib", "memory_leak_tests"], main = "utils/tests/run_memory_leak_tests.py", size = "large", @@ -2240,6 +2201,7 @@ py_test( # subdirectory: connectors/ # .................................... # Framestacking examples only run in smoke-test mode (a few iters only). +# PPO py_test( name = "examples/connectors/frame_stacking_ppo", main = "examples/connectors/frame_stacking.py", @@ -2257,8 +2219,26 @@ py_test( srcs = ["examples/connectors/frame_stacking.py"], args = ["--enable-new-api-stack", "--num-agents=2", "--stop-iter=2", "--framework=torch", "--algo=PPO", "--num-env-runners=4", "--num-cpus=6"] ) +# IMPALA +py_test( + name = "examples/connectors/frame_stacking_impala", + main = "examples/connectors/frame_stacking.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "medium", + srcs = ["examples/connectors/frame_stacking.py"], + args = ["--enable-new-api-stack", "--stop-iter=2", "--framework=torch", "--algo=IMPALA"] +) +py_test( + name = "examples/connectors/frame_stacking_multi_agent_impala", + main = "examples/connectors/frame_stacking.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "medium", + srcs = ["examples/connectors/frame_stacking.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--stop-iter=2", "--framework=torch", "--algo=IMPALA", "--num-env-runners=4", "--num-cpus=6"] +) # Nested observation spaces (flattening). 
+# PPO py_test( name = "examples/connectors/flatten_observations_dict_space_ppo", main = "examples/connectors/flatten_observations_dict_space.py", @@ -2267,7 +2247,6 @@ py_test( srcs = ["examples/connectors/flatten_observations_dict_space.py"], args = ["--enable-new-api-stack", "--as-test", "--stop-reward=400.0", "--framework=torch", "--algo=PPO"] ) - py_test( name = "examples/connectors/flatten_observations_dict_space_multi_agent_ppo", main = "examples/connectors/flatten_observations_dict_space.py", @@ -2276,6 +2255,23 @@ py_test( srcs = ["examples/connectors/flatten_observations_dict_space.py"], args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=800.0", "--framework=torch", "--algo=PPO"] ) +# IMPALA +py_test( + name = "examples/connectors/flatten_observations_dict_space_impala", + main = "examples/connectors/flatten_observations_dict_space.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "large", + srcs = ["examples/connectors/flatten_observations_dict_space.py"], + args = ["--enable-new-api-stack", "--as-test", "--stop-reward=400.0", "--stop-timesteps=2000000", "--framework=torch", "--algo=IMPALA"] +) +py_test( + name = "examples/connectors/flatten_observations_dict_space_multi_agent_impala", + main = "examples/connectors/flatten_observations_dict_space.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "large", + srcs = ["examples/connectors/flatten_observations_dict_space.py"], + args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=800.0", "--stop-timesteps=2000000", "--framework=torch", "--algo=IMPALA"] +) # Prev-r/prev actions + LSTM example. py_test( @@ -2297,6 +2293,7 @@ py_test( ) # MeanStd filtering example. +# PPO py_test( name = "examples/connectors/mean_std_filtering_ppo", main = "examples/connectors/mean_std_filtering.py", @@ -2314,6 +2311,25 @@ py_test( srcs = ["examples/connectors/mean_std_filtering.py"], args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=-600.0", "--framework=torch", "--algo=PPO", "--num-env-runners=5", "--num-cpus=7"] ) +# IMPALA +# TODO (sven): Make IMPALA learn Pendulum OR make this script flexible to accept +# (lopsided obs) CartPole as well. +# py_test( +# name = "examples/connectors/mean_std_filtering_impala", +# main = "examples/connectors/mean_std_filtering.py", +# tags = ["team:rllib", "exclusive", "examples"], +# size = "medium", +# srcs = ["examples/connectors/mean_std_filtering.py"], +# args = ["--enable-new-api-stack", "--as-test", "--stop-reward=-300.0", "--framework=torch", "--algo=IMPALA", "--num-env-runners=2"] +# ) +# py_test( +# name = "examples/connectors/mean_std_filtering_multi_agent_impala", +# main = "examples/connectors/mean_std_filtering.py", +# tags = ["team:rllib", "exclusive", "examples"], +# size = "medium", +# srcs = ["examples/connectors/mean_std_filtering.py"], +# args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=-600.0", "--framework=torch", "--algo=IMPALA", "--num-env-runners=5", "--num-cpus=6"] +# ) # subdirectory: curriculum/ # .................................... 
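# NOTE (editor): The IMPALA example- and learning-tests registered above all drive their
# scripts with `--enable-new-api-stack` (plus `--algo=IMPALA` for the shared connector
# examples). The tuned-example files themselves are not reproduced in this hunk, so the
# following is only a minimal, hedged sketch of the new-API-stack IMPALA setup such a test
# exercises, built solely from config calls that appear elsewhere in this patch; the
# concrete values are illustrative and not the ones used by the actual test scripts.
from ray.rllib.algorithms.impala import ImpalaConfig

config = (
    ImpalaConfig()
    # Activate both halves of the new API stack
    # (RLModule/Learner APIs + EnvRunner/ConnectorV2 APIs).
    .api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    .environment("CartPole-v1")
    .env_runners(num_env_runners=2)
    .training(lr=0.0005, vf_loss_coeff=0.5, entropy_coeff=0.01)
)
algo = config.build()
for _ in range(3):
    results = algo.train()
    print(results)
algo.stop()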
@@ -3015,7 +3031,6 @@ py_test( args = ["--run=IMPALA", "--as-test", "--framework=torch", "--stop-reward=28", "--num-cpus=4"] ) -# TODO (Kourosh): tf2 ~5x slower compared to torch on the new stack py_test( name = "examples/cartpole_lstm_ppo_tf2", main = "examples/cartpole_lstm.py", diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 876a47b121f4..cd4a1f8c8a08 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -174,7 +174,7 @@ def _get_learner_bundles(cf: AlgorithmConfig) -> List[Dict[str, int]]: """Selects the right resource bundles for learner workers based off of cf. Args: - cf: The algorithm config. + cf: The AlgorithmConfig instance to extract bundle-information from. Returns: A list of resource bundles for the learner workers. @@ -187,7 +187,7 @@ def _get_learner_bundles(cf: AlgorithmConfig) -> List[Dict[str, int]]: elif cf.num_cpus_per_learner: learner_bundles = [ { - "CPU": cf.num_cpus_per_learner * cf.num_learners, + "CPU": cf.num_learners * cf.num_cpus_per_learner, } ] else: @@ -480,13 +480,17 @@ def __init__( # components (including timers, counters and other stats in its own # `training_step()` and other methods) as well as custom callbacks. self.metrics = MetricsLogger() - # Initialize lifetime counters. + # Initialize lifetime counters (or those that are common as Tune stop criteria. + # We don't want tune to crash regularly b/c these stats might be still missing + # entirely after the first few iterations. self.metrics.log_dict( { NUM_ENV_STEPS_SAMPLED_LIFETIME: 0, NUM_AGENT_STEPS_SAMPLED_LIFETIME: {DEFAULT_AGENT_ID: 0}, NUM_ENV_STEPS_TRAINED_LIFETIME: 0, NUM_AGENT_STEPS_TRAINED_LIFETIME: {DEFAULT_AGENT_ID: 0}, + NUM_EPISODES_LIFETIME: 0, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": np.nan, }, reduce="sum", ) @@ -647,9 +651,6 @@ def setup(self, config: AlgorithmConfig) -> None: logdir=self.logdir, ) - # Ensure remote workers are initially in sync with the local worker. - self.workers.sync_weights(inference_only=True) - # Compile, validate, and freeze an evaluation config. self.evaluation_config = self.config.get_evaluation_config_object() self.evaluation_config.validate() @@ -729,7 +730,6 @@ def setup(self, config: AlgorithmConfig) -> None: # Need to add back method_type in case Algorithm is restored from checkpoint method_config["type"] = method_type - self.learner_group = None if self.config.enable_rl_module_and_learner: local_worker = self.workers.local_worker() env = spaces = None @@ -787,13 +787,23 @@ def setup(self, config: AlgorithmConfig) -> None: self.workers.foreach_worker( lambda w: w.set_is_policy_to_train(policies_to_train), ) - - # Sync the weights from the learner group to the rollout workers. - weights = self.learner_group.get_weights( - inference_only=self.config.enable_env_runner_and_connector_v2 - ) - local_worker.set_weights(weights) - self.workers.sync_weights(inference_only=True) + # Sync the weights from the learner group to the rollout workers. + weights = self.learner_group.get_weights() + local_worker.set_weights(weights) + self.workers.sync_weights(inference_only=True) + # New stack/EnvRunner APIs: Use get/set_state (no more get/set_weights). + else: + # Sync the weights from the learner group to the rollout workers. 
+ weights = self.learner_group.get_weights( + inference_only=self.config.enable_env_runner_and_connector_v2 + ) + local_worker.set_state({"rl_module": weights}) + self.workers.sync_env_runner_states( + config=self.config, + env_steps_sampled=self.metrics.peek( + NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0 + ), + ) # Run `on_algorithm_init` callback after initialization is done. self.callbacks.on_algorithm_init(algorithm=self, metrics_logger=self.metrics) @@ -889,7 +899,7 @@ def step(self) -> ResultDict: ) eval_results = self.evaluation_metrics - # Sync filters on workers. + # Sync EnvRunner workers. # TODO (sven): For the new API stack, the common execution pattern for any algo # should be: [sample + get_metrics + get_state] -> send all these in one remote # call down to `training_step` (where episodes are sent as ray object @@ -897,13 +907,16 @@ def step(self) -> ResultDict: # in special key in result dict and perform the connector merge/broadcast # inside the `training_step` as well. See the new IMPALA for an example. if self.config.enable_env_runner_and_connector_v2: - # Synchronize EnvToModule and ModuleToEnv connector states and broadcast new - # states back to all EnvRunners. - with self.metrics.log_time((TIMERS, SYNCH_ENV_CONNECTOR_STATES_TIMER)): - self.workers.sync_env_runner_states( - config=self.config, - env_steps_sampled=self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME), - ) + if not self.config._dont_auto_sync_env_runner_states: + # Synchronize EnvToModule and ModuleToEnv connector states and broadcast + # new states back to all EnvRunners. + with self.metrics.log_time((TIMERS, SYNCH_ENV_CONNECTOR_STATES_TIMER)): + self.workers.sync_env_runner_states( + config=self.config, + env_steps_sampled=self.metrics.peek( + NUM_ENV_STEPS_SAMPLED_LIFETIME + ), + ) # Compile final ResultDict from `train_results` and `eval_results`. Note # that, as opposed to the old API stack, EnvRunner stats should already be # in `train_results` and `eval_results`. @@ -2843,19 +2856,28 @@ def __getstate__(self) -> Dict: "config": self.config, } - if hasattr(self, "workers"): - state["worker"] = self.workers.local_worker().get_state() + # New API stack. + if self.config.enable_env_runner_and_connector_v2: + # Save entire MetricsLogger state. + state["metrics_logger"] = self.metrics.get_state() - # Also store eval `policy_mapping_fn` (in case it's different from main one). - # Note, the new `EnvRunner API` has no policy mapping function. - if ( - hasattr(self, "evaluation_workers") - and self.evaluation_workers is not None - and not self.config.enable_env_runner_and_connector_v2 - ): - state[ - "eval_policy_mapping_fn" - ] = self.evaluation_workers.local_worker().policy_mapping_fn + # Old API stack. + else: + if hasattr(self, "workers"): + state["worker"] = self.workers.local_worker().get_state() + + # Also store eval `policy_mapping_fn` (in case it's different from main + # one). Note, the new `EnvRunner API` has no policy mapping function. + if ( + hasattr(self, "evaluation_workers") + and self.evaluation_workers is not None + ): + state[ + "eval_policy_mapping_fn" + ] = self.evaluation_workers.local_worker().policy_mapping_fn + + # Save counters. + state["counters"] = self._counters # TODO: Experimental functionality: Store contents of replay buffer # to checkpoint, only if user has configured this. @@ -2864,13 +2886,6 @@ def __getstate__(self) -> Dict: ): state["local_replay_buffer"] = self.local_replay_buffer.get_state() - # New API stack: Save entire MetricsLogger state. 
- if self.config.enable_env_runner_and_connector_v2: - state["metrics_logger"] = self.metrics.get_state() - # Old API stack: Save only counters. - else: - state["counters"] = self._counters - # Save current `training_iteration`. state[TRAINING_ITERATION] = self.training_iteration @@ -3140,19 +3155,25 @@ def _run_one_training_iteration(self) -> Tuple[ResultDict, "TrainIterCtx"]: if self.config.get("framework") == "tf2" and not tf.executing_eagerly(): tf1.enable_eager_execution() - results = None + results = {} + training_step_results = None # Create a step context ... with TrainIterCtx(algo=self) as train_iter_ctx: # .. so we can query it whether we should stop the iteration loop (e.g. # when we have reached `min_time_s_per_iteration`). - while not train_iter_ctx.should_stop(results): + while not train_iter_ctx.should_stop(training_step_results): # Before training step, try to bring failed workers back. with self._timers[RESTORE_WORKERS_TIMER]: self.restore_workers(self.workers) # Try to train one step. with self._timers[TRAINING_STEP_TIMER]: - results = self.training_step() + # TODO (sven): Should we reduce the different + # `training_step_results` over time with MetricsLogger. + training_step_results = self.training_step() + + if training_step_results: + results = training_step_results return results, train_iter_ctx @@ -3172,7 +3193,6 @@ def _run_one_evaluation( Returns: The results dict from the evaluation call. """ - if self.evaluation_workers is not None: with self._timers[RESTORE_EVAL_WORKERS_TIMER]: self.restore_workers(self.evaluation_workers) @@ -3493,7 +3513,9 @@ def _compile_iteration_results_old_and_hybrid_api_stacks( ) results["num_healthy_workers"] = self.workers.num_healthy_remote_workers() - results["num_in_flight_async_reqs"] = self.workers.num_in_flight_async_reqs() + results[ + "num_in_flight_async_sample_reqs" + ] = self.workers.num_in_flight_async_reqs() results[ "num_remote_worker_restarts" ] = self.workers.num_remote_worker_restarts() diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 5529ebed8be8..dccf062a22fb 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -520,6 +520,7 @@ def __init__(self, algo_class: Optional[type] = None): self._disable_preprocessor_api = False self._disable_action_flattening = False self._disable_initialize_loss_from_dummy_batch = False + self._dont_auto_sync_env_runner_states = False # Has this config object been frozen (cannot alter its attributes anymore). self._is_frozen = False @@ -928,7 +929,7 @@ def build_env_to_module_connector(self, env): ) ) # Batch all data. - pipeline.append(BatchIndividualItems()) + pipeline.append(BatchIndividualItems(multi_agent=self.is_multi_agent())) # Convert to Tensors. pipeline.append(NumpyToTensor()) @@ -1011,7 +1012,12 @@ def build_module_to_env_connector(self, env): return pipeline - def build_learner_connector(self, input_observation_space, input_action_space): + def build_learner_connector( + self, + input_observation_space, + input_action_space, + device=None, + ): from ray.rllib.connectors.learner import ( AddColumnsFromEpisodesToTrainBatch, AddObservationsFromEpisodesToBatch, @@ -1019,13 +1025,18 @@ def build_learner_connector(self, input_observation_space, input_action_space): AgentToModuleMapping, BatchIndividualItems, LearnerConnectorPipeline, + NumpyToTensor, ) custom_connectors = [] # Create a learner connector pipeline (including RLlib's default # learner connector piece) and return it. 
if self._learner_connector is not None: - val_ = self._learner_connector(input_observation_space, input_action_space) + val_ = self._learner_connector( + input_observation_space, + input_action_space, + # device, # TODO (sven): Also pass device into custom builder. + ) from ray.rllib.connectors.connector_v2 import ConnectorV2 @@ -1074,7 +1085,9 @@ def build_learner_connector(self, input_observation_space, input_action_space): ) ) # Batch all data. - pipeline.append(BatchIndividualItems()) + pipeline.append(BatchIndividualItems(multi_agent=self.is_multi_agent())) + # Convert to Tensors. + pipeline.append(NumpyToTensor(as_learner_connector=True, device=device)) return pipeline def build_learner_group( @@ -3130,7 +3143,10 @@ def is_atari(self) -> bool: @property def total_train_batch_size(self): - if self.train_batch_size_per_learner is not None: + if ( + self.train_batch_size_per_learner is not None + and self.enable_rl_module_and_learner + ): return self.train_batch_size_per_learner * (self.num_learners or 1) else: return self.train_batch_size diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index 5228e6da3771..482801af88ae 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -24,7 +24,7 @@ NUM_ENV_STEPS_SAMPLED, NUM_TARGET_UPDATES, ) -from ray.rllib.utils.metrics import ALL_MODULES, LEARNER_STATS_KEY +from ray.rllib.utils.metrics import LEARNER_STATS_KEY from ray.rllib.utils.typing import ( ResultDict, ) @@ -277,19 +277,13 @@ def __init__(self, config, *args, **kwargs): lambda p, _: p.update_target() ) - def after_train_step(self, train_results: ResultDict) -> None: - """Updates the target network and the KL coefficient for the APPO-loss. - - This method is called from within the `training_step` method after each train - update. - The target network update frequency is calculated automatically by the product - of `num_sgd_iter` setting (usually 1 for APPO) and `minibatch_buffer_size`. - - Args: - train_results: The results dict collected during the most recent - training step. - """ + @override(Impala) + def training_step(self) -> ResultDict: + train_results = super().training_step() + # Update the target network and the KL coefficient for the APPO-loss. + # The target network update frequency is calculated automatically by the product + # of `num_sgd_iter` setting (usually 1 for APPO) and `minibatch_buffer_size`. if self.config.enable_rl_module_and_learner: if NUM_TARGET_UPDATES in train_results: self._counters[NUM_TARGET_UPDATES] += train_results[NUM_TARGET_UPDATES] @@ -341,24 +335,6 @@ def update(pi, pi_id): # Worker. self.workers.local_worker().foreach_policy_to_train(update) - @override(Impala) - def _get_additional_update_kwargs(self, train_results) -> dict: - return dict( - last_update=self._counters[LAST_TARGET_UPDATE_TS], - mean_kl_loss_per_module={ - module_id: r[LEARNER_RESULTS_KL_KEY] - for module_id, r in train_results.items() - if module_id != ALL_MODULES - }, - ) - - @override(Impala) - def training_step(self) -> ResultDict: - train_results = super().training_step() - - # Update KL, target network periodically. 
- self.after_train_step(train_results) - return train_results @classmethod diff --git a/rllib/algorithms/appo/appo_learner.py b/rllib/algorithms/appo/appo_learner.py index 72b359eaf7d5..27ae79bf04d6 100644 --- a/rllib/algorithms/appo/appo_learner.py +++ b/rllib/algorithms/appo/appo_learner.py @@ -51,6 +51,16 @@ def additional_update_for_module( Args: module_id: """ + + # return dict( + # last_update=self._counters[LAST_TARGET_UPDATE_TS], + # mean_kl_loss_per_module={ + # module_id: r[LEARNER_RESULTS_KL_KEY] + # for module_id, r in train_results.items() + # if module_id != ALL_MODULES + # }, + # ) + # TODO (avnish) Using steps trained here instead of sampled ... I'm not sure # why the other implementation uses sampled. # The difference in steps sampled/trained is pretty diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index c39d09f3a989..9129dde30f82 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -82,15 +82,10 @@ def __init__( # First thing first, enable eager execution if necessary. base.enable_eager_execution_if_necessary() - # If Learner API is used, we don't need any loss-specific mixins. - # However, we also would like to avoid creating special Policy-subclasses - # for this as the entire Policy concept will soon not be used anymore with - # the new Learner- and RLModule APIs. - if not config.get("enable_rl_module_and_learner", False): - # Although this is a no-op, we call __init__ here to make it clear - # that base.__init__ will use the make_model() call. - VTraceClipGradients.__init__(self) - VTraceOptimizer.__init__(self) + # Although this is a no-op, we call __init__ here to make it clear + # that base.__init__ will use the make_model() call. + VTraceClipGradients.__init__(self) + VTraceOptimizer.__init__(self) # Initialize base class. base.__init__( @@ -111,8 +106,7 @@ def __init__( ValueNetworkMixin.__init__(self, config) KLCoeffMixin.__init__(self, config) - if not config.get("enable_rl_module_and_learner", False): - GradStatsMixin.__init__(self) + GradStatsMixin.__init__(self) # Note: this is a bit ugly, but loss and optimizer initialization must # happen after all the MixIns are initialized. diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index c62e01941ca3..56ab8f11267e 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -70,14 +70,9 @@ class APPOTorchPolicy( def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.algorithms.appo.appo.APPOConfig().to_dict(), **config) - # If Learner API is used, we don't need any loss-specific mixins. - # However, we also would like to avoid creating special Policy-subclasses - # for this as the entire Policy concept will soon not be used anymore with - # the new Learner- and RLModule APIs. - if not config.get("enable_rl_module_and_learner", False): - # Although this is a no-op, we call __init__ here to make it clear - # that base.__init__ will use the make_model() call. - VTraceOptimizer.__init__(self) + # Although this is a no-op, we call __init__ here to make it clear + # that base.__init__ will use the make_model() call. 
+ VTraceOptimizer.__init__(self) lr_schedule_additional_args = [] if config.get("_separate_vf_optimizer"): diff --git a/rllib/algorithms/appo/tests/test_appo_learner.py b/rllib/algorithms/appo/tests/test_appo_learner.py index c8ab1dedd0db..c4cc5c92a4a6 100644 --- a/rllib/algorithms/appo/tests/test_appo_learner.py +++ b/rllib/algorithms/appo/tests/test_appo_learner.py @@ -11,16 +11,10 @@ from ray.rllib.core import DEFAULT_MODULE_ID from ray.rllib.core.columns import Columns from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.metrics.learner_info import LEARNER_INFO -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.test_utils import framework_iterator +from ray.rllib.utils.metrics import LEARNER_RESULTS from ray.rllib.utils.torch_utils import convert_to_torch_tensor -tf1, tf, _ = try_import_tf() - -tf1.enable_eager_execution() - frag_length = 50 FAKE_BATCH = { @@ -35,9 +29,6 @@ [False for _ in range(frag_length - 1)] + [True] ).astype(np.float32), Columns.VF_PREDS: np.array(list(reversed(range(frag_length))), dtype=np.float32), - Columns.VALUES_BOOTSTRAPPED: np.array( - list(reversed(range(frag_length))), dtype=np.float32 - ), Columns.ACTION_LOGP: np.log( np.random.uniform(low=0, high=1, size=(frag_length,)) ).astype(np.float32), @@ -57,7 +48,10 @@ def test_appo_loss(self): """Test that appo_policy_rlm loss matches the appo learner loss.""" config = ( appo.APPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment("CartPole-v1") .env_runners( num_env_runners=0, @@ -77,35 +71,31 @@ def test_appo_loss(self): # config.env_runners() only deep-updates it config.exploration_config = {} - for fw in framework_iterator(config, frameworks=("torch", "tf2")): - algo = config.build() + algo = config.build() - if fw == "tf2": - train_batch = SampleBatch( - tree.map_structure(lambda x: tf.convert_to_tensor(x), FAKE_BATCH) - ) - else: - train_batch = SampleBatch( - tree.map_structure(lambda x: convert_to_torch_tensor(x), FAKE_BATCH) - ) + train_batch = SampleBatch( + tree.map_structure(lambda x: convert_to_torch_tensor(x), FAKE_BATCH) + ) - algo_config = config.copy(copy_frozen=False) - algo_config.learners(num_learners=0) - algo_config.validate() + algo_config = config.copy(copy_frozen=False) + algo_config.learners(num_learners=0) + algo_config.validate() - learner_group = algo_config.build_learner_group( - env=algo.workers.local_worker().env - ) - learner_group.set_weights(algo.get_weights()) - learner_group.update_from_batch(batch=train_batch.as_multi_agent()) + learner_group = algo_config.build_learner_group( + env=algo.workers.local_worker().env + ) + learner_group.update_from_batch(batch=train_batch.as_multi_agent()) - algo.stop() + algo.stop() def test_kl_coeff_changes(self): initial_kl_coeff = 0.01 config = ( appo.APPOConfig() - .api_stack(enable_rl_module_and_learner=True) + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) .environment("CartPole-v1") # Asynchronous Algo, make sure we have some results after 1 iteration. .reporting(min_time_s_per_iteration=10) @@ -126,18 +116,18 @@ def test_kl_coeff_changes(self): kl_coeff=initial_kl_coeff, ) ) - for _ in framework_iterator(config, frameworks=("torch", "tf2")): - algo = config.build() - # Call train while results aren't returned because this is - # a asynchronous algorithm and results are returned asynchronously. 
- while True: - results = algo.train() - if results.get("info", {}).get(LEARNER_INFO, {}).get(DEFAULT_MODULE_ID): - break - curr_kl_coeff = results["info"][LEARNER_INFO][DEFAULT_MODULE_ID][ - LEARNER_RESULTS_CURR_KL_COEFF_KEY - ] - self.assertNotEqual(curr_kl_coeff, initial_kl_coeff) + algo = config.build() + # Call train while results aren't returned because this is + # a asynchronous algorithm and results are returned asynchronously. + while True: + results = algo.train() + print(results) + if results.get(LEARNER_RESULTS, {}).get(DEFAULT_MODULE_ID): + break + curr_kl_coeff = results[LEARNER_RESULTS][DEFAULT_MODULE_ID][ + LEARNER_RESULTS_CURR_KL_COEFF_KEY + ] + self.assertNotEqual(curr_kl_coeff, initial_kl_coeff) if __name__ == "__main__": diff --git a/rllib/algorithms/appo/tests/test_appo_off_policyness.py b/rllib/algorithms/appo/tests/test_appo_off_policyness.py deleted file mode 100644 index b5df78ba809d..000000000000 --- a/rllib/algorithms/appo/tests/test_appo_off_policyness.py +++ /dev/null @@ -1,55 +0,0 @@ -import unittest - -import ray -import ray.rllib.algorithms.appo as appo -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.test_utils import ( - check_compute_single_action, - check_off_policyness, - framework_iterator, -) - -tf1, tf, tfv = try_import_tf() - - -class TestAPPOOffPolicyNess(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init(num_gpus=1) - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_appo_off_policyness(self): - config = ( - appo.APPOConfig() - .environment("CartPole-v1") - .resources(num_gpus=1) - .env_runners(num_env_runners=4) - ) - num_iterations = 3 - - for _ in framework_iterator(config): - for num_aggregation_workers in [0, 1]: - config.num_aggregation_workers = num_aggregation_workers - print("aggregation-workers={}".format(config.num_aggregation_workers)) - algo = config.build() - for i in range(num_iterations): - results = algo.train() - # Roughly: Reaches up to 0.4 for 2 rollout workers and up to 0.2 for - # 1 rollout worker. - off_policy_ness = check_off_policyness(results, upper_limit=2.0) - print(f"off-policy'ness={off_policy_ness}") - - check_compute_single_action( - algo, - ) - algo.stop() - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index 6db878a123c1..27033724d152 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -87,15 +87,19 @@ def compute_loss_for_module( trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - if self.config.enable_env_runner_and_connector_v2: - bootstrap_values = batch[Columns.VALUES_BOOTSTRAPPED] - else: - bootstrap_values_time_major = make_time_major( - batch[Columns.VALUES_BOOTSTRAPPED], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=recurrent_seq_len, - ) - bootstrap_values = bootstrap_values_time_major[-1] + assert Columns.VALUES_BOOTSTRAPPED not in batch + # Use as bootstrap values the vf-preds in the next "batch row", except + # for the very last row (which doesn't have a next row), for which the + # bootstrap value does not matter b/c it has a +1ts value at its end + # anyways. So we chose an arbitrary item (for simplicity of not having to + # move new data to the device). 
+ bootstrap_values = torch.cat( + [ + values_time_major[0][1:], # 0th ts values from "next row" + values_time_major[0][0:1], # <- can use any arbitrary value here + ], + dim=0, + ) # The discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index b506dcf546aa..2b25afe69a45 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -27,35 +27,44 @@ RemoteCallResults, ) from ray.rllib.utils.actors import create_colocated_actors -from ray.rllib.utils.annotations import override -from ray.rllib.utils.metrics import ALL_MODULES -from ray.rllib.utils.deprecation import ( - DEPRECATED_VALUE, - deprecation_warning, -) +from ray.rllib.utils.annotations import OldAPIStack, override from ray.rllib.utils.metrics import ( + ALL_MODULES, + ENV_RUNNER_RESULTS, + LEARNER_GROUP, + LEARNER_RESULTS, + LEARNER_UPDATE_TIMER, + MEAN_NUM_EPISODE_LISTS_RECEIVED, + MEAN_NUM_LEARNER_GROUP_UPDATE_CALLED, NUM_AGENT_STEPS_SAMPLED, + NUM_AGENT_STEPS_SAMPLED_LIFETIME, NUM_AGENT_STEPS_TRAINED, NUM_ENV_STEPS_SAMPLED, NUM_ENV_STEPS_SAMPLED_LIFETIME, NUM_ENV_STEPS_TRAINED, + NUM_ENV_STEPS_TRAINED_LIFETIME, + NUM_EPISODES, + NUM_EPISODES_LIFETIME, NUM_MODULE_STEPS_TRAINED, NUM_SYNCH_WORKER_WEIGHTS, NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, SYNCH_WORKER_WEIGHTS_TIMER, SAMPLE_TIMER, + TIMERS, ) from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import ReplayMode from ray.rllib.utils.replay_buffers.replay_buffer import _ALL_POLICIES from ray.rllib.utils.schedules.scheduler import Scheduler from ray.rllib.utils.typing import ( + LearningRateOrSchedule, PartialAlgorithmConfigDict, PolicyID, ResultDict, SampleBatchType, ) from ray.tune.execution.placement_groups import PlacementGroupFactory +from ray.util.annotations import DeveloperAPI logger = logging.getLogger(__name__) @@ -118,19 +127,23 @@ def __init__(self, algo_class=None): self.vtrace = True self.vtrace_clip_rho_threshold = 1.0 self.vtrace_clip_pg_rho_threshold = 1.0 - self.num_multi_gpu_tower_stacks = 1 - self.minibatch_buffer_size = 1 + self.num_multi_gpu_tower_stacks = 1 # @OldAPIstack + self.minibatch_buffer_size = 1 # @OldAPIstack self.num_sgd_iter = 1 - self.replay_proportion = 0.0 - self.replay_buffer_num_slots = 0 - self.learner_queue_size = 16 - self.learner_queue_timeout = 300 + self.replay_proportion = 0.0 # @OldAPIstack + self.replay_buffer_num_slots = 0 # @OldAPIstack + self.learner_queue_size = 3 + self.learner_queue_timeout = 300 # @OldAPIstack + self.max_requests_in_flight_per_sampler_worker = 2 self.max_requests_in_flight_per_aggregator_worker = 2 self.timeout_s_sampler_manager = 0.0 self.timeout_s_aggregator_manager = 0.0 self.broadcast_interval = 1 self.num_aggregation_workers = 0 - self.num_gpu_loader_threads = 16 + self.num_gpu_loader_threads = 8 + # Impala takes care of its own EnvRunner (weights, connector, counters) + # synching. + self._dont_auto_sync_env_runner_states = True self.grad_clip = 40.0 # Note: Only when using enable_rl_module_and_learner=True can the clipping mode @@ -138,28 +151,29 @@ def __init__(self, algo_class=None): # global_norm, no matter the value of `grad_clip_by`. 
self.grad_clip_by = "global_norm" - self.opt_type = "adam" + self.opt_type = "adam" # @OldAPIstack self.lr_schedule = None - self.decay = 0.99 - self.momentum = 0.0 - self.epsilon = 0.1 + self.decay = 0.99 # @OldAPIstack + self.momentum = 0.0 # @OldAPIstack + self.epsilon = 0.1 # @OldAPIstack self.vf_loss_coeff = 0.5 self.entropy_coeff = 0.01 self.entropy_coeff_schedule = None - self._separate_vf_optimizer = False - self._lr_vf = 0.0005 + self._separate_vf_optimizer = False # @OldAPIstack + self._lr_vf = 0.0005 # @OldAPIstack self.after_train_step = None # Override some of AlgorithmConfig's default values with IMPALA-specific values. self.rollout_fragment_length = 50 - self.train_batch_size = 500 + self.train_batch_size = 500 # @OldAPIstack + self.train_batch_size_per_learner = 500 self._minibatch_size = "auto" self.num_env_runners = 2 - self.num_gpus = 1 + self.num_gpus = 1 # @OldAPIstack self.lr = 0.0005 self.min_time_s_per_iteration = 10 - self._tf_policy_handles_more_than_one_loss = True - self.exploration_config = { + self._tf_policy_handles_more_than_one_loss = True # @OldAPIstack + self.exploration_config = { # @OldAPIstack # The Exploration class to use. In the simplest case, this is the name # (str) of any class present in the `rllib.utils.exploration` package. # You can also provide the python class directly or the full location @@ -171,9 +185,6 @@ def __init__(self, algo_class=None): # __sphinx_doc_end__ # fmt: on - # Deprecated value. - self.num_data_loader_buffers = DEPRECATED_VALUE - @override(AlgorithmConfig) def training( self, @@ -182,6 +193,7 @@ def training( vtrace_clip_rho_threshold: Optional[float] = NotProvided, vtrace_clip_pg_rho_threshold: Optional[float] = NotProvided, gamma: Optional[float] = NotProvided, + num_gpu_loader_threads: Optional[int] = NotProvided, num_multi_gpu_tower_stacks: Optional[int] = NotProvided, minibatch_buffer_size: Optional[int] = NotProvided, minibatch_size: Optional[Union[int, str]] = NotProvided, @@ -195,7 +207,6 @@ def training( timeout_s_aggregator_manager: Optional[float] = NotProvided, broadcast_interval: Optional[int] = NotProvided, num_aggregation_workers: Optional[int] = NotProvided, - num_gpu_loader_threads: Optional[int] = NotProvided, grad_clip: Optional[float] = NotProvided, opt_type: Optional[str] = NotProvided, lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, @@ -203,7 +214,7 @@ def training( momentum: Optional[float] = NotProvided, epsilon: Optional[float] = NotProvided, vf_loss_coeff: Optional[float] = NotProvided, - entropy_coeff: Optional[float] = NotProvided, + entropy_coeff: Optional[LearningRateOrSchedule] = NotProvided, entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, _separate_vf_optimizer: Optional[bool] = NotProvided, _lr_vf: Optional[float] = NotProvided, @@ -217,6 +228,12 @@ def training( vtrace_clip_rho_threshold: vtrace_clip_pg_rho_threshold: gamma: Float specifying the discount factor of the Markov Decision process. + num_gpu_loader_threads: The number of GPU-loader threads (per Learner + worker), used to load incoming (CPU) batches to the GPU, if applicable. + The incoming batches are produced by each Learner's LearnerConnector + pipeline. After loading the batches on the GPU, the threads place them + on yet another queue for the Learner thread (only one per Learner + worker) to pick up and perform `forward_train/loss` computations. num_multi_gpu_tower_stacks: For each stack of multi-GPU towers, how many slots should we reserve for parallel data loading? 
Set this to >1 to load data into GPUs in parallel. This will increase GPU memory usage @@ -264,8 +281,6 @@ def training( (`num_env_runners`). Note that n should be much smaller than m. This can make sense if ingesting >2GB/s of samples, or if the data requires decompression. - num_gpu_loader_threads: The number of GPU loader threads to use for loading - train-ready batches to the GPU(s). grad_clip: If specified, clip the global norm of gradients by this amount. opt_type: Either "adam" or "rmsprop". lr_schedule: Learning rate schedule. In the format of @@ -301,6 +316,10 @@ def training( self.vtrace_clip_rho_threshold = vtrace_clip_rho_threshold if vtrace_clip_pg_rho_threshold is not NotProvided: self.vtrace_clip_pg_rho_threshold = vtrace_clip_pg_rho_threshold + if gamma is not NotProvided: + self.gamma = gamma + if num_gpu_loader_threads is not NotProvided: + self.num_gpu_loader_threads = num_gpu_loader_threads if num_multi_gpu_tower_stacks is not NotProvided: self.num_multi_gpu_tower_stacks = num_multi_gpu_tower_stacks if minibatch_buffer_size is not NotProvided: @@ -327,8 +346,6 @@ def training( self.timeout_s_sampler_manager = timeout_s_sampler_manager if timeout_s_aggregator_manager is not NotProvided: self.timeout_s_aggregator_manager = timeout_s_aggregator_manager - if num_gpu_loader_threads is not NotProvided: - self.num_gpu_loader_threads = num_gpu_loader_threads if grad_clip is not NotProvided: self.grad_clip = grad_clip if opt_type is not NotProvided: @@ -353,8 +370,6 @@ def training( self._lr_vf = _lr_vf if after_train_step is not NotProvided: self.after_train_step = after_train_step - if gamma is not NotProvided: - self.gamma = gamma if minibatch_size is not NotProvided: self._minibatch_size = minibatch_size @@ -375,13 +390,6 @@ def validate(self) -> None: # New stack w/ EnvRunners does NOT support aggregation workers yet or a mixin # replay buffer. if self.enable_env_runner_and_connector_v2: - if self.num_aggregation_workers > 0: - raise ValueError( - "Aggregation workers not supported on new API stack w/ new " - "EnvRunner API! Set `config.num_aggregation_workers = 0` or " - "disable the new API stack via " - "`config.api_stack(enable_rl_module_and_learner=False)`." - ) if self.replay_ratio != 0.0: raise ValueError( "The new API stack in combination with the new EnvRunner API " @@ -389,13 +397,16 @@ def validate(self) -> None: f"{self} (set `config.replay_proportion` to 0.0)!" ) - if self.num_data_loader_buffers != DEPRECATED_VALUE: - deprecation_warning( - "num_data_loader_buffers", "num_multi_gpu_tower_stacks", error=True - ) - # Entropy coeff schedule checking. if self.enable_rl_module_and_learner: + if not self.enable_env_runner_and_connector_v2: + raise ValueError( + "Setting `enable_rl_module_and_learner` to True and " + "`enable_env_runner_and_connector_v2` to False ('hybrid API stack'" + ") is not longer supported! Set both to True or both to False, " + "instead." + ) + if self.entropy_coeff_schedule is not None: raise ValueError( "`entropy_coeff_schedule` is deprecated and must be None! Use the " @@ -418,8 +429,8 @@ def validate(self) -> None: elif self.num_aggregation_workers > self.num_env_runners / 2: logger.warning( "`num_aggregation_workers` should be significantly smaller " - "than `num_workers`! Try setting it to 0.5*`num_workers` or " - "less." + "than `num_env_runners`! Try setting it to 0.5*`num_env_runners`" + " or less." 
) # If two separate optimizers/loss terms used for tf, must also set @@ -516,53 +527,11 @@ def get_default_rl_module_spec(self) -> SingleAgentRLModuleSpec: ) -def make_learner_thread(local_worker, config): - if not config["simple_optimizer"]: - logger.info( - "Enabling multi-GPU mode, {} GPUs, {} parallel tower-stacks".format( - config["num_gpus"], config["num_multi_gpu_tower_stacks"] - ) - ) - num_stacks = config["num_multi_gpu_tower_stacks"] - buffer_size = config["minibatch_buffer_size"] - if num_stacks < buffer_size: - logger.warning( - "In multi-GPU mode you should have at least as many " - "multi-GPU tower stacks (to load data into on one device) as " - "you have stack-index slots in the buffer! You have " - f"configured {num_stacks} stacks and a buffer of size " - f"{buffer_size}. Setting " - f"`minibatch_buffer_size={num_stacks}`." - ) - config["minibatch_buffer_size"] = num_stacks - - learner_thread = MultiGPULearnerThread( - local_worker, - num_gpus=config["num_gpus"], - lr=config["lr"], - train_batch_size=config["train_batch_size"], - num_multi_gpu_tower_stacks=config["num_multi_gpu_tower_stacks"], - num_sgd_iter=config["num_sgd_iter"], - learner_queue_size=config["learner_queue_size"], - learner_queue_timeout=config["learner_queue_timeout"], - num_data_load_threads=config["num_gpu_loader_threads"], - ) - else: - learner_thread = LearnerThread( - local_worker, - minibatch_buffer_size=config["minibatch_buffer_size"], - num_sgd_iter=config["num_sgd_iter"], - learner_queue_size=config["learner_queue_size"], - learner_queue_timeout=config["learner_queue_timeout"], - ) - return learner_thread - - class Impala(Algorithm): """Importance weighted actor/learner architecture (IMPALA) Algorithm == Overview of data flow in IMPALA == - 1. Policy evaluation in parallel across `num_workers` actors produces + 1. Policy evaluation in parallel across `num_env_runners` actors produces batches of size `rollout_fragment_length * num_envs_per_env_runner`. 2. If enabled, the replay buffer stores and produces batches of size `rollout_fragment_length * num_envs_per_env_runner`. @@ -606,8 +575,8 @@ def get_default_policy_class( def setup(self, config: AlgorithmConfig): super().setup(config) - # Queue of batches to be sent to the Learner. - self.batches_to_place_on_learner = [] + # Queue of data to be sent to the Learner. + self.data_to_place_on_learner = [] # The local mixin buffer (if required). self.local_mixin_buffer = None @@ -628,7 +597,9 @@ def setup(self, config: AlgorithmConfig): actor_specs=[ # (class, args, kwargs={}, count=1) ( - AggregatorWorker, + AggregationWorker + if self.config.enable_env_runner_and_connector_v2 + else AggregatorWorker_OldAPIStack, [ self.config, ], @@ -647,9 +618,8 @@ def setup(self, config: AlgorithmConfig): self.config.max_requests_in_flight_per_aggregator_worker ), ) - self._timeout_s_aggregator_manager = ( - self.config.timeout_s_aggregator_manager - ) + elif self.config.enable_rl_module_and_learner: + self._aggregator_actor_manager = None else: # Create our local mixin buffer if the num of aggregation workers is 0. if self.config.replay_proportion > 0.0: @@ -677,109 +647,292 @@ def setup(self, config: AlgorithmConfig): @override(Algorithm) def training_step(self) -> ResultDict: - # First, check, whether our learner thread is still healthy. - if ( - not self.config.enable_rl_module_and_learner - and not self._learner_thread.is_alive() - ): - raise RuntimeError("The learner thread died while training!") + # Old- and hybrid API stacks. 
+ if not self.config.enable_rl_module_and_learner: + return self._training_step_old_api_stack() - use_tree_aggregation = ( - self._aggregator_actor_manager - and self._aggregator_actor_manager.num_healthy_actors() > 0 - ) + do_async_updates = self.config.num_learner_workers > 0 - # Get sampled SampleBatches from our workers (by ray references if we use - # tree-aggregation). - unprocessed_sample_batches = self.get_samples_from_workers( - return_object_refs=use_tree_aggregation, - ) - # Tag workers that actually produced ready sample batches this iteration. - # Those workers will have to get updated at the end of the iteration. - workers_that_need_updates = { - worker_id for worker_id, _ in unprocessed_sample_batches - } + # Asynchronously request all EnvRunners to sample and return their current + # (e.g. ConnectorV2) states and sampling metrics/stats. + # Note that each item in `episode_refs` is a reference to a list of Episodes. + with self.metrics.log_time((TIMERS, SAMPLE_TIMER)): + ( + episode_refs, + connector_states, + env_runner_metrics, + env_runner_indices_to_update, + ) = self._sample_and_get_connector_states() + # Reduce EnvRunner metrics over the n EnvRunners. + self.metrics.merge_and_log_n_dicts( + env_runner_metrics, key=ENV_RUNNER_RESULTS + ) - # Send the collected batches (still object refs) to our aggregation workers. - if use_tree_aggregation: - batches = self.process_experiences_tree_aggregation( - unprocessed_sample_batches + # Log the average number of sample results (list of episodes) received. + self.metrics.log_value(MEAN_NUM_EPISODE_LISTS_RECEIVED, len(episode_refs)) + self.metrics.log_value( + "_mean_num_episode_ts_received", + len(episode_refs) + * self.config.num_envs_per_env_runner + * self.config.get_rollout_fragment_length(), + ) + self.metrics.log_value( + "_mean_num_episode_ts_received_using_reduced_metrics", + self.metrics.peek( + (ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED), default=0 + ), ) - # Resolve collected batches here on local process (using the mixin buffer). - else: - batches = self.process_experiences_directly(unprocessed_sample_batches) - # Increase sampling counters now that we have the actual SampleBatches on - # the local process (and can measure their sizes). - for batch in batches: - self._counters[NUM_ENV_STEPS_SAMPLED] += batch.count - self._counters[NUM_AGENT_STEPS_SAMPLED] += batch.agent_steps() - # Concatenate single batches into batches of size `train_batch_size`. - self.concatenate_batches_and_pre_queue(batches) - # Using the Learner API. Call `update()` on our LearnerGroup object with - # all collected batches. - if self.config.enable_rl_module_and_learner: - train_results = self.learn_on_processed_samples() - module_ids_to_update = set(train_results.keys()) - {ALL_MODULES} - # TODO (sven): Move to Learner._after_gradient_based_update(). - additional_results = self.learner_group.additional_update( - module_ids_to_update=module_ids_to_update, - timestep=self._counters[ - NUM_ENV_STEPS_TRAINED - if self.config.count_steps_by == "env_steps" - else NUM_AGENT_STEPS_TRAINED - ], - # TODO (sven): Feels hacked, but solves the problem of algos inheriting - # from IMPALA (like APPO). In the old stack, we didn't have this - # problem b/c IMPALA didn't need to call any additional update methods - # as the entropy- and lr-schedules were handled by - # `Policy.on_global_var_update()`. - **self._get_additional_update_kwargs(train_results), + # Log lifetime counts for env- and agent steps. 
+ if env_runner_metrics: + self.metrics.log_dict( + { + NUM_AGENT_STEPS_SAMPLED_LIFETIME: self.metrics.peek( + (ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED) + ), + NUM_ENV_STEPS_SAMPLED_LIFETIME: self.metrics.peek( + (ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED) + ), + NUM_EPISODES_LIFETIME: self.metrics.peek( + (ENV_RUNNER_RESULTS, NUM_EPISODES) + ), + }, + reduce="sum", ) - for key, res in additional_results.items(): - if key in train_results: - train_results[key].update(res) - else: - # Move train batches (of size `train_batch_size`) onto learner queue. - self.place_processed_samples_on_learner_thread_queue() - # Extract most recent train results from learner thread. - train_results = self.process_trained_results() - # Sync worker weights (only those policies that were actually updated). - with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: - if self.config.enable_rl_module_and_learner: - if train_results: - pids = list(set(train_results.keys()) - {ALL_MODULES}) - self.update_workers_from_learner_group( - workers_that_need_updates=workers_that_need_updates, - policy_ids=pids, - ) - else: - pids = list(train_results.keys()) - self.update_workers_if_necessary( - workers_that_need_updates=workers_that_need_updates, - policy_ids=pids, + # "Batch" collected episode refs into groups, such that exactly + # `total_train_batch_size` timesteps are sent to + # `LearnerGroup.update_from_episodes()`. + data_packages_for_learner_group = self._pre_queue_episode_refs(episode_refs) + # If we do tree aggregation, we perform the LearnerConnector pass on the + # aggregation workers. + if self.config.num_aggregation_workers: + data_packages_for_learner_group = ( + self._process_env_runner_data_via_aggregation( + data_packages_for_learner_group ) + ) - # With a training step done, try to bring any aggregators back to life - # if necessary. - # Aggregation workers are stateless, so we do not need to restore any - # state here. - if self._aggregator_actor_manager: - self._aggregator_actor_manager.probe_unhealthy_actors( - timeout_seconds=self.config.env_runner_health_probe_timeout_s, + # Call the LearnerGroup's `update_from_episodes` method. + with self.metrics.log_time((TIMERS, LEARNER_UPDATE_TIMER)): + self.metrics.log_value( + key=MEAN_NUM_LEARNER_GROUP_UPDATE_CALLED, + value=len(data_packages_for_learner_group), ) + rl_module_state = None + last_good_learner_results = None + + for batch_ref_or_episode_list_ref in data_packages_for_learner_group: + if self.config.num_aggregation_workers: + learner_results = self.learner_group.update_from_batch( + batch=batch_ref_or_episode_list_ref, + async_update=do_async_updates, + return_state=True, + timesteps={ + NUM_ENV_STEPS_SAMPLED_LIFETIME: self.metrics.peek( + NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0 + ), + }, + ) + else: + learner_results = self.learner_group.update_from_episodes( + episodes=batch_ref_or_episode_list_ref, + async_update=do_async_updates, + return_state=True, + timesteps={ + NUM_ENV_STEPS_SAMPLED_LIFETIME: self.metrics.peek( + NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0 + ), + }, + ) + if not do_async_updates: + learner_results = [learner_results] + for results_from_n_learners in learner_results: + for r in results_from_n_learners: + rl_module_state = r.pop( + "_rl_module_state_after_update", rl_module_state + ) + self.metrics.merge_and_log_n_dicts( + stats_dicts=results_from_n_learners, + key=LEARNER_RESULTS, + ) + last_good_learner_results = results_from_n_learners + + # Update LearnerGroup's own stats. 
+ self.metrics.log_dict(self.learner_group.get_stats(), key=LEARNER_GROUP) + self.metrics.log_value( + NUM_ENV_STEPS_TRAINED_LIFETIME, + self.metrics.peek( + (LEARNER_RESULTS, ALL_MODULES, NUM_ENV_STEPS_TRAINED), default=0 + ), + reduce="sum", + ) + # self.metrics.log_value(NUM_MODULE_STEPS_TRAINED_LIFETIME, self.metrics.peek( + # (LEARNER_RESULTS, NUM_MODULE_STEPS_TRAINED) + # ), reduce="sum") + + # Figure out, whether we should sync/broadcast the (remote) EnvRunner states. + # Note: `learner_results` is a List of n (num async calls) Lists of m + # (num Learner workers) ResultDicts each. + self.metrics.log_value( + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, 1, reduce="sum" + ) + if last_good_learner_results: + # Merge available EnvRunner states into local worker's EnvRunner state. + # Broadcast merged EnvRunner state AND new model weights back to all remote + # EnvRunners that - in this call - had returned samples. + if ( + self.metrics.peek( + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS + ) + >= self.config.broadcast_interval + ): + self.metrics.set_value( + NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS, 0 + ) + self.metrics.log_value(NUM_SYNCH_WORKER_WEIGHTS, 1, reduce="sum") + with self.metrics.log_time((TIMERS, SYNCH_WORKER_WEIGHTS_TIMER)): + self.workers.sync_env_runner_states( + config=self.config, + env_runner_indices_to_update=env_runner_indices_to_update, + env_steps_sampled=self.metrics.peek( + NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0 + ), + connector_states=connector_states, + rl_module_state=rl_module_state, + ) + + if env_runner_metrics or last_good_learner_results: + return self.metrics.reduce() + return {} - if self.config.enable_rl_module_and_learner: - if train_results: - # Store the most recent result and return it if no new result is - # available. This keeps backwards compatibility with the old - # training stack / results reporting stack. This is necessary - # any time we develop an asynchronous algorithm. - self._results = train_results - return self._results + def _sample_and_get_connector_states(self): + def _remote_sample_get_state_and_metrics(_worker): + _episodes = _worker.sample() + # Get the EnvRunner's connector states. + _connector_states = _worker.get_state( + components=["env_to_module_connector", "module_to_env_connector"] + ) + _metrics = _worker.get_metrics() + # Return episode lists by reference so we don't have to send them to the + # main algo process, but to the Learner workers directly. + return ray.put(_episodes), _connector_states, _metrics + + env_runner_indices_to_update = set() + episode_refs = [] + connector_states = [] + env_runner_metrics = [] + num_healthy_remote_workers = self.workers.num_healthy_remote_workers() + + # Perform asynchronous sampling on all (healthy) remote rollout workers. + if num_healthy_remote_workers > 0: + self.workers.foreach_worker_async(_remote_sample_get_state_and_metrics) + async_results: List[ + Tuple[int, ObjectRef] + ] = self.workers.fetch_ready_async_reqs( + timeout_seconds=self.config.timeout_s_sampler_manager, + return_obj_refs=False, + ) + # Get results from the n different async calls and store those EnvRunner + # indices we should update. + results = [] + for r in async_results: + env_runner_indices_to_update.add(r[0]) + results.append(r[1]) + + for (episodes, states, metrics) in results: + episode_refs.append(episodes) + connector_states.append(states) + env_runner_metrics.append(metrics) + # Sample from the local EnvRunner. 
else: - return train_results + episodes = self.workers.local_worker().sample() + env_runner_metrics = [self.workers.local_worker().get_metrics()] + episode_refs = [ray.put(episodes)] + connector_states = [ + self.workers.local_worker().get_state( + components=["env_to_module_connector", "module_to_env_connector"] + ) + ] + + return ( + episode_refs, + connector_states, + env_runner_metrics, + list(env_runner_indices_to_update), + ) + + def _pre_queue_episode_refs( + self, episode_refs: List[ObjectRef] + ) -> List[List[ObjectRef]]: + # Each element in this list is itself a list of ObjRef[Episodes]. + # Each ObjRef was returned by one EnvRunner from a single sample() call. + episode_refs_for_learner_group: List[List[ObjectRef]] = [] + + for ref in episode_refs: + self.batch_being_built.append(ref) + if ( + len(self.batch_being_built) + * self.config.num_envs_per_env_runner + * self.config.get_rollout_fragment_length() + >= self.config.total_train_batch_size + ): + episode_refs_for_learner_group.append(self.batch_being_built) + self.batch_being_built = [] + + return episode_refs_for_learner_group + + def _process_env_runner_data_via_aggregation( + self, + learner_group_data_packages: List[List[ObjectRef]], + ) -> List[ObjectRef]: + """Process sample batches using tree aggregation workers. + + Args: + learner_group_data_packages: List of (env_runner_id, ObjectRef of EnvRunner- + returned data) + + NOTE: This will provide speedup when sample batches have been compressed, + and the decompression can happen on the aggregation workers in parallel to + the training. + + Returns: + Batches that have been processed by the mixin buffers on the aggregation + workers. + """ + + def _process_data(_actor, _episodes): + return _actor.process_episodes(ray.get(_episodes)) + + for data in learner_group_data_packages: + assert isinstance(data, ObjectRef), ( + "For efficiency, process_experiences_tree_aggregation should " + f"be given ObjectRefs instead of {type(data)}." + ) + # Randomly pick an aggregation worker to process this batch. + aggregator_id = random.choice( + self._aggregator_actor_manager.healthy_actor_ids() + ) + calls_placed = self._aggregator_actor_manager.foreach_actor_async( + partial(_process_data, _episodes=data), + remote_actor_ids=[aggregator_id], + ) + if calls_placed <= 0: + self.metrics.log_value( + "num_times_no_aggregation_worker_available", 1, reduce="sum" + ) + + waiting_processed_sample_batches: RemoteCallResults = ( + self._aggregator_actor_manager.fetch_ready_async_reqs( + timeout_seconds=self.config.timeout_s_aggregator_manager, + ) + ) + _handle_remote_call_result_errors( + waiting_processed_sample_batches, + self.config.ignore_env_runner_failures, + ) + + return list(waiting_processed_sample_batches.ignore_errors()) @classmethod @override(Algorithm) @@ -804,13 +957,17 @@ def default_resource_request( # from RolloutWorkers (n rollout workers map to m # aggregation workers, where m < n) and always use 1 CPU # each. - "CPU": cf.num_cpus_for_main_process + cf.num_aggregation_workers, + "CPU": max( + cf.num_cpus_for_main_process, + cf.num_cpus_per_learner if cf.num_learners == 0 else 0, + ) + + cf.num_aggregation_workers, "GPU": 0 if cf._fake_gpus else cf.num_gpus, } ] + [ { - # RolloutWorkers. + # EnvRunners. "CPU": cf.num_cpus_per_env_runner, "GPU": cf.num_gpus_per_env_runner, **cf.custom_resources_per_env_runner, @@ -835,11 +992,10 @@ def default_resource_request( ) # TODO (avnishn): Remove this once we have a way to extend placement group # factories. 
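A minimal, self-contained sketch of the grouping rule used by `_pre_queue_episode_refs` above: each incoming reference is assumed to carry `num_envs_per_env_runner * rollout_fragment_length` timesteps, and a group is closed once `total_train_batch_size` is reached. Plain Python with made-up numbers, no Ray object refs (the helper name below is illustrative, not RLlib API):

from typing import Any, List, Tuple

def group_refs(
    refs: List[Any], num_envs: int, frag_len: int, train_batch_size: int
) -> Tuple[List[List[Any]], List[Any]]:
    """Group sample refs until roughly `train_batch_size` timesteps are queued."""
    groups, building = [], []
    for ref in refs:
        building.append(ref)
        # Every ref is assumed to hold `num_envs * frag_len` env steps.
        if len(building) * num_envs * frag_len >= train_batch_size:
            groups.append(building)
            building = []
    return groups, building  # leftover refs stay queued for the next call

# With 4 envs per runner and fragment length 50, each ref counts as 200 steps,
# so a train batch of 500 closes after 3 refs (3 * 200 = 600 >= 500).
groups, leftover = group_refs(list(range(7)), 4, 50, 500)
assert [len(g) for g in groups] == [3, 3] and len(leftover) == 1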
- if cf.enable_rl_module_and_learner: - # Resources for the Algorithm. - learner_bundles = cls._get_learner_bundles(cf) - - bundles += learner_bundles + # Only if we have actual (remote) learner workers. In case of a local learner, + # the resource has already been taken care of above. + if cf.enable_rl_module_and_learner and cf.num_learners > 0: + bundles += cls._get_learner_bundles(cf) # Return PlacementGroupFactory containing all needed resources # (already properly defined as device bundles). @@ -848,61 +1004,80 @@ def default_resource_request( strategy=cf.placement_strategy, ) - def concatenate_batches_and_pre_queue(self, batches: List[SampleBatch]) -> None: - """Concatenate batches that are being returned from rollout workers + @OldAPIStack + def _training_step_old_api_stack(self): + # First, check, whether our learner thread is still healthy. + if not self._learner_thread.is_alive(): + raise RuntimeError("The learner thread died while training!") - Args: - batches: List of batches of experiences from EnvRunners. - """ + use_tree_aggregation = ( + self._aggregator_actor_manager + and self._aggregator_actor_manager.num_healthy_actors() > 0 + ) - def aggregate_into_larger_batch(): - if ( - sum(b.count for b in self.batch_being_built) - >= self.config.total_train_batch_size - ): - batch_to_add = concat_samples(self.batch_being_built) - self.batches_to_place_on_learner.append(batch_to_add) - self.batch_being_built = [] + # Get sampled SampleBatches from our workers (by ray references if we use + # tree-aggregation). + unprocessed_sample_batches = ( + self._get_samples_from_workers_old_and_hybrid_api_stack( + return_object_refs=use_tree_aggregation, + ) + ) + # Tag workers that actually produced ready sample batches this iteration. + # Those workers will have to get updated at the end of the iteration. + workers_that_need_updates = { + worker_id for worker_id, _ in unprocessed_sample_batches + } - for batch in batches: - # TODO (sven): Strange bug after a RolloutWorker crash and proper - # restart. The bug is related to (old, non-V2) connectors being used and - # seems to happen inside the AgentCollector's `add_action_reward_next_obs` - # method, at the end of which the number of vf_preds (and all other - # extra action outs) in the batch is one smaller than the number of obs/ - # actions/rewards, which then leads to a malformed train batch. - # IMPALA/APPO crash inside the loss function (during v-trace operations) - # b/c of the resulting shape mismatch. The following if-block prevents - # this from happening and it can be removed once we are on the new API - # stack for good (and use the new connectors and also no longer - # AgentCollectors, RolloutWorkers, Policies, TrajectoryView API, etc..): - if ( - self.config.batch_mode == "truncate_episodes" - and self.config.enable_connectors - and self.config.recreate_failed_env_runners - ): - if any( - SampleBatch.VF_PREDS in pb - and ( - pb[SampleBatch.VF_PREDS].shape[0] - != pb[SampleBatch.REWARDS].shape[0] - ) - for pb in batch.policy_batches.values() - ): - continue + # Send the collected batches (still object refs) to our aggregation workers. + if use_tree_aggregation: + batches = self._process_experiences_tree_aggregation( + unprocessed_sample_batches + ) + # Resolve collected batches here on local process (using the mixin buffer). 
+ else: + batches = self._process_experiences_directly(unprocessed_sample_batches) - self.batch_being_built.append(batch) - aggregate_into_larger_batch() + # Increase sampling counters now that we have the actual SampleBatches on + # the local process (and can measure their sizes). + for batch in batches: + self._counters[NUM_ENV_STEPS_SAMPLED] += batch.count + self._counters[NUM_AGENT_STEPS_SAMPLED] += batch.agent_steps() + # Concatenate single batches into batches of size `total_train_batch_size`. + self._concatenate_batches_and_pre_queue(batches) + # Move train batches (of size `total_train_batch_size`) onto learner queue. + self._place_processed_samples_on_learner_thread_queue() + # Extract most recent train results from learner thread. + train_results = self._process_trained_results() - def get_samples_from_workers( - self, - return_object_refs: Optional[bool] = False, - ) -> List[Tuple[int, Union[ObjectRef, SampleBatchType]]]: - """Get samples from rollout workers for training. + # Sync worker weights (only those policies that were actually updated). + with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: + pids = list(train_results.keys()) + self._update_workers_old_api_stack( + workers_that_need_updates=workers_that_need_updates, + policy_ids=pids, + ) - Args: - return_object_refs: If True, return ObjectRefs instead of the samples - directly. This is useful when using aggregator workers so that data + # With a training step done, try to bring any aggregators back to life + # if necessary. + # Aggregation workers are stateless, so we do not need to restore any + # state here. + if self._aggregator_actor_manager: + self._aggregator_actor_manager.probe_unhealthy_actors( + timeout_seconds=self.config.env_runner_health_probe_timeout_s, + ) + + return train_results + + @OldAPIStack + def _get_samples_from_workers_old_and_hybrid_api_stack( + self, + return_object_refs: Optional[bool] = False, + ) -> List[Tuple[int, Union[ObjectRef, SampleBatchType]]]: + """Get samples from rollout workers for training. + + Args: + return_object_refs: If True, return ObjectRefs instead of the samples + directly. This is useful when using aggregator workers so that data collected on rollout workers is directly de referenced on the aggregator workers instead of first in the driver and then on the aggregator workers. @@ -928,7 +1103,7 @@ def get_samples_from_workers( timeout_seconds=self.config.timeout_s_sampler_manager, return_obj_refs=return_object_refs, ) - elif ( + elif self.config.num_env_runners == 0 or ( self.workers.local_worker() and self.workers.local_worker().async_env is not None ): @@ -943,61 +1118,197 @@ def get_samples_from_workers( return sample_batches - def learn_on_processed_samples(self) -> ResultDict: + @OldAPIStack + def _process_experiences_tree_aggregation( + self, + worker_to_sample_batches_refs: List[Tuple[int, ObjectRef]], + ) -> List[SampleBatchType]: + """Process sample batches using tree aggregation workers. + + Args: + worker_to_sample_batches_refs: List of (worker_id, sample_batch_ref) + + NOTE: This will provide speedup when sample batches have been compressed, + and the decompression can happen on the aggregation workers in parallel to + the training. + + Returns: + Batches that have been processed by the mixin buffers on the aggregation + workers. 
+ + """ + + def _process_episodes(actor, batch): + return actor.process_episodes(ray.get(batch)) + + for _, batch in worker_to_sample_batches_refs: + assert isinstance(batch, ObjectRef), ( + "For efficiency, process_experiences_tree_aggregation should " + f"be given ObjectRefs instead of {type(batch)}." + ) + # Randomly pick an aggregation worker to process this batch. + aggregator_id = random.choice( + self._aggregator_actor_manager.healthy_actor_ids() + ) + calls_placed = self._aggregator_actor_manager.foreach_actor_async( + partial(_process_episodes, batch=batch), + remote_actor_ids=[aggregator_id], + ) + if calls_placed <= 0: + self.metrics.log_value( + "num_times_no_aggregation_worker_available", 1, reduce="sum" + ) + + waiting_processed_sample_batches: RemoteCallResults = ( + self._aggregator_actor_manager.fetch_ready_async_reqs( + timeout_seconds=self.config.timeout_s_aggregator_manager, + ) + ) + _handle_remote_call_result_errors( + waiting_processed_sample_batches, + self.config.ignore_env_runner_failures, + ) + + return [b.get() for b in waiting_processed_sample_batches.ignore_errors()] + + @OldAPIStack + def _process_experiences_directly( + self, + worker_to_sample_batches: List[Tuple[int, SampleBatch]], + ) -> List[SampleBatchType]: + """Process sample batches directly on the driver, for training. + + Args: + worker_to_sample_batches: List of (worker_id, sample_batch) tuples. + + Returns: + Batches that have been processed by the mixin buffer. + + """ + batches = [b for _, b in worker_to_sample_batches] + processed_batches = [] + + for batch in batches: + assert not isinstance( + batch, ObjectRef + ), "_process_experiences_directly can not handle ObjectRefs. " + batch = batch.decompress_if_needed() + # Only make a pass through the buffer, if replay proportion is > 0.0 (and + # we actually have one). + if self.local_mixin_buffer: + self.local_mixin_buffer.add(batch) + batch = self.local_mixin_buffer.replay(_ALL_POLICIES) + if batch: + processed_batches.append(batch) + + return processed_batches + + @OldAPIStack + def _concatenate_batches_and_pre_queue(self, batches: List[SampleBatch]) -> None: + """Concatenate batches that are being returned from rollout workers + + Args: + batches: List of batches of experiences from EnvRunners. + """ + + def aggregate_into_larger_batch(): + if ( + sum(b.count for b in self.batch_being_built) + >= self.config.total_train_batch_size + ): + batch_to_add = concat_samples(self.batch_being_built) + self.data_to_place_on_learner.append(batch_to_add) + self.batch_being_built = [] + + for batch in batches: + # TODO (sven): Strange bug after a RolloutWorker crash and proper + # restart. The bug is related to (old, non-V2) connectors being used and + # seems to happen inside the AgentCollector's `add_action_reward_next_obs` + # method, at the end of which the number of vf_preds (and all other + # extra action outs) in the batch is one smaller than the number of obs/ + # actions/rewards, which then leads to a malformed train batch. + # IMPALA/APPO crash inside the loss function (during v-trace operations) + # b/c of the resulting shape mismatch. 
The following if-block prevents + # this from happening and it can be removed once we are on the new API + # stack for good (and use the new connectors and also no longer + # AgentCollectors, RolloutWorkers, Policies, TrajectoryView API, etc..): + if ( + self.config.batch_mode == "truncate_episodes" + and self.config.enable_connectors + and self.config.recreate_failed_env_runners + ): + if any( + SampleBatch.VF_PREDS in pb + and ( + pb[SampleBatch.VF_PREDS].shape[0] + != pb[SampleBatch.REWARDS].shape[0] + ) + for pb in batch.policy_batches.values() + ): + continue + + self.batch_being_built.append(batch) + aggregate_into_larger_batch() + + @OldAPIStack + def _learn_on_processed_samples(self) -> ResultDict: """Update the learner group with the latest batch of processed samples. Returns: Aggregated results from the learner group after an update is completed. """ + # Nothing on the queue -> Don't send requests to learner group + # or no results ready (from previous `self.learner_group.update()` calls) for + # reducing. + if not self.data_to_place_on_learner: + return {} + # There are batches on the queue -> Send them all to the learner group. - if self.batches_to_place_on_learner: - batches = self.batches_to_place_on_learner[:] - self.batches_to_place_on_learner.clear() - # If there are no learner workers and learning is directly on the driver - # Then we can't do async updates, so we need to block. - async_update = self.config.num_learners > 0 - results = [] - for batch in batches: - result = self.learner_group.update_from_batch( - batch=batch, - timesteps={ - NUM_ENV_STEPS_SAMPLED_LIFETIME: ( - self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) - ), - }, - async_update=async_update, - num_iters=self.config.num_sgd_iter, - minibatch_size=self.config.minibatch_size, - ) - if not async_update: - results = [result] + batches = self.data_to_place_on_learner[:] + self.data_to_place_on_learner.clear() - for r in results: - self._counters[NUM_ENV_STEPS_TRAINED] += r[ALL_MODULES].pop( - NUM_ENV_STEPS_TRAINED - ) - self._counters[NUM_AGENT_STEPS_TRAINED] += r[ALL_MODULES].pop( - NUM_MODULE_STEPS_TRAINED - ) + # If there are no learner workers and learning is directly on the driver + # Then we can't do async updates, so we need to block. + async_update = self.config.num_learners > 0 + results = [] + for batch in batches: + results = self.learner_group.update_from_batch( + batch=batch, + timesteps={ + NUM_ENV_STEPS_SAMPLED_LIFETIME: ( + self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) + ), + }, + async_update=async_update, + num_iters=self.config.num_sgd_iter, + minibatch_size=self.config.minibatch_size, + ) + if not async_update: + results = [results] - self._counters.update(self.learner_group.get_stats()) - # If there are results, reduce-mean over each individual value and return. - if results: - return tree.map_structure(lambda *x: np.mean(x), *results) + for r in results: + self._counters[NUM_ENV_STEPS_TRAINED] += r[ALL_MODULES].pop( + NUM_ENV_STEPS_TRAINED + ) + self._counters[NUM_AGENT_STEPS_TRAINED] += r[ALL_MODULES].pop( + NUM_MODULE_STEPS_TRAINED + ) + + self._counters.update(self.learner_group.get_stats()) + # If there are results, reduce-mean over each individual value and return. + if results: + return tree.map_structure(lambda *x: np.mean(x), *results) # Nothing on the queue -> Don't send requests to learner group # or no results ready (from previous `self.learner_group.update_from_batch()` # calls) for reducing. 
return {} - def place_processed_samples_on_learner_thread_queue(self) -> None: - """Place processed samples on the learner queue for training. - - NOTE: This method is called if self.config.enable_rl_module_and_learner is - False. - """ - for i, batch in enumerate(self.batches_to_place_on_learner): + @OldAPIStack + def _place_processed_samples_on_learner_thread_queue(self) -> None: + """Place processed samples on the learner queue for training.""" + for i, batch in enumerate(self.data_to_place_on_learner): try: self._learner_thread.inqueue.put( batch, @@ -1006,7 +1317,7 @@ def place_processed_samples_on_learner_thread_queue(self) -> None: # from thrashing when there are more samples than the learner can # reasonably process. # see https://github.com/ray-project/ray/pull/26581#issuecomment-1187877674 # noqa - block=i == len(self.batches_to_place_on_learner) - 1, + block=i == len(self.data_to_place_on_learner) - 1, ) self._counters["num_samples_added_to_queue"] += ( batch.agent_steps() @@ -1016,14 +1327,12 @@ def place_processed_samples_on_learner_thread_queue(self) -> None: except queue.Full: self._counters["num_times_learner_queue_full"] += 1 - self.batches_to_place_on_learner.clear() + self.data_to_place_on_learner.clear() - def process_trained_results(self) -> ResultDict: + @OldAPIStack + def _process_trained_results(self) -> ResultDict: """Process training results that are outputed by the learner thread. - NOTE: This method is called if self.config.enable_rl_module_and_learner is - False. - Returns: Aggregated results from the learner thread after an update is completed. @@ -1059,131 +1368,8 @@ def process_trained_results(self) -> ResultDict: return final_learner_info - def process_experiences_directly( - self, - worker_to_sample_batches: List[Tuple[int, SampleBatch]], - ) -> List[SampleBatchType]: - """Process sample batches directly on the driver, for training. - - Args: - worker_to_sample_batches: List of (worker_id, sample_batch) tuples. - - Returns: - Batches that have been processed by the mixin buffer. - - """ - batches = [b for _, b in worker_to_sample_batches] - processed_batches = [] - - for batch in batches: - assert not isinstance( - batch, ObjectRef - ), "process_experiences_directly can not handle ObjectRefs. " - batch = batch.decompress_if_needed() - # Only make a pass through the buffer, if replay proportion is > 0.0 (and - # we actually have one). - if self.local_mixin_buffer: - self.local_mixin_buffer.add(batch) - batch = self.local_mixin_buffer.replay(_ALL_POLICIES) - if batch: - processed_batches.append(batch) - - return processed_batches - - def process_experiences_tree_aggregation( - self, - worker_to_sample_batches_refs: List[Tuple[int, ObjectRef]], - ) -> List[SampleBatchType]: - """Process sample batches using tree aggregation workers. - - Args: - worker_to_sample_batches_refs: List of (worker_id, sample_batch_ref) - - NOTE: This will provide speedup when sample batches have been compressed, - and the decompression can happen on the aggregation workers in parallel to - the training. - - Returns: - Batches that have been processed by the mixin buffers on the aggregation - workers. - - """ - - def _process_episodes(actor, batch): - return actor.process_episodes(ray.get(batch)) - - for _, batch in worker_to_sample_batches_refs: - assert isinstance(batch, ObjectRef), ( - "For efficiency, process_experiences_tree_aggregation should " - f"be given ObjectRefs instead of {type(batch)}." - ) - # Randomly pick an aggregation worker to process this batch. 
- aggregator_id = random.choice( - self._aggregator_actor_manager.healthy_actor_ids() - ) - calls_placed = self._aggregator_actor_manager.foreach_actor_async( - partial(_process_episodes, batch=batch), - remote_actor_ids=[aggregator_id], - ) - if calls_placed <= 0: - self._counters["num_times_no_aggregation_worker_available"] += 1 - - waiting_processed_sample_batches: RemoteCallResults = ( - self._aggregator_actor_manager.fetch_ready_async_reqs( - timeout_seconds=self._timeout_s_aggregator_manager, - ) - ) - _handle_remote_call_result_errors( - waiting_processed_sample_batches, - self.config.ignore_env_runner_failures, - ) - - return [b.get() for b in waiting_processed_sample_batches.ignore_errors()] - - def update_workers_from_learner_group( - self, - workers_that_need_updates: Set[int], - policy_ids: Optional[List[PolicyID]] = None, - ): - """Updates all RolloutWorkers that require updating. - - Updates only if NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS has been - reached and the worker has sent samples in this iteration. Also only updates - those policies, whose IDs are given via `policies` (if None, update all - policies). - - Args: - workers_that_need_updates: Set of worker IDs that need to be updated. - policy_ids: Optional list of Policy IDs to update. If None, will update all - policies on the to-be-updated workers. - """ - # Only need to update workers if there are remote workers. - self._counters[NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS] += 1 - if ( - self._counters[NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS] - >= self.config.broadcast_interval - and workers_that_need_updates - ): - self._counters[NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS] = 0 - self._counters[NUM_SYNCH_WORKER_WEIGHTS] += 1 - weights = self.learner_group.get_weights(policy_ids) - if self.config.num_env_runners == 0: - worker = self.workers.local_worker() - worker.set_weights(weights) - else: - weights_ref = ray.put(weights) - self.workers.foreach_worker( - func=lambda w: w.set_weights(ray.get(weights_ref)), - local_worker=False, - remote_worker_ids=list(workers_that_need_updates), - timeout_seconds=0, # Don't wait for the workers to finish. - ) - # If we have a local worker that we sample from in addition to - # our remote workers, we need to update its weights as well. - if self.config.create_env_on_local_worker: - self.workers.local_worker().set_weights(weights) - - def update_workers_if_necessary( + @OldAPIStack + def _update_workers_old_api_stack( self, workers_that_need_updates: Set[int], policy_ids: Optional[List[PolicyID]] = None, @@ -1240,14 +1426,6 @@ def update_workers_if_necessary( timeout_seconds=0, # Don't wait for the workers to finish. ) - def _get_additional_update_kwargs(self, train_results: dict) -> dict: - """Returns the kwargs to `LearnerGroup.additional_update()`. - - Should be overridden by subclasses to specify wanted/needed kwargs for - their own implementation of `Learner.additional_update_for_module()`. 
- """ - return {} - @override(Algorithm) def _compile_iteration_results_old_and_hybrid_api_stacks(self, *args, **kwargs): result = super()._compile_iteration_results_old_and_hybrid_api_stacks( @@ -1260,8 +1438,35 @@ def _compile_iteration_results_old_and_hybrid_api_stacks(self, *args, **kwargs): return result +@DeveloperAPI +@ray.remote(num_cpus=0, max_restarts=-1) +class AggregationWorker(FaultAwareApply): + """A worker performing LearnerConnector pass throughs of collected episodes.""" + + def __init__(self, config: AlgorithmConfig): + self.config = config + self._learner_connector = self.config.build_learner_connector( + input_observation_space=None, + input_action_space=None, + ) + self._rl_module = None + + def process_episodes(self, episodes): + batch = self._learner_connector( + batch={}, + episodes=episodes, + rl_module=self._rl_module, + shared_data={}, + ) + return batch + + def get_host(self) -> str: + return platform.node() + + +@OldAPIStack @ray.remote(num_cpus=0, max_restarts=-1) -class AggregatorWorker(FaultAwareApply): +class AggregatorWorker_OldAPIStack(FaultAwareApply): """A worker for doing tree aggregation of collected episodes""" def __init__(self, config: AlgorithmConfig): @@ -1284,3 +1489,46 @@ def process_episodes(self, batch: SampleBatchType) -> SampleBatchType: def get_host(self) -> str: return platform.node() + + +@OldAPIStack +def make_learner_thread(local_worker, config): + if not config["simple_optimizer"]: + logger.info( + "Enabling multi-GPU mode, {} GPUs, {} parallel tower-stacks".format( + config["num_gpus"], config["num_multi_gpu_tower_stacks"] + ) + ) + num_stacks = config["num_multi_gpu_tower_stacks"] + buffer_size = config["minibatch_buffer_size"] + if num_stacks < buffer_size: + logger.warning( + "In multi-GPU mode you should have at least as many " + "multi-GPU tower stacks (to load data into on one device) as " + "you have stack-index slots in the buffer! You have " + f"configured {num_stacks} stacks and a buffer of size " + f"{buffer_size}. Setting " + f"`minibatch_buffer_size={num_stacks}`." 
+ ) + config["minibatch_buffer_size"] = num_stacks + + learner_thread = MultiGPULearnerThread( + local_worker, + num_gpus=config["num_gpus"], + lr=config["lr"], + train_batch_size=config["train_batch_size"], + num_multi_gpu_tower_stacks=config["num_multi_gpu_tower_stacks"], + num_sgd_iter=config["num_sgd_iter"], + learner_queue_size=config["learner_queue_size"], + learner_queue_timeout=config["learner_queue_timeout"], + num_data_load_threads=config["num_gpu_loader_threads"], + ) + else: + learner_thread = LearnerThread( + local_worker, + minibatch_buffer_size=config["minibatch_buffer_size"], + num_sgd_iter=config["num_sgd_iter"], + learner_queue_size=config["learner_queue_size"], + learner_queue_timeout=config["learner_queue_timeout"], + ) + return learner_thread diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index 40d37461ff25..ce67ac8a1e1c 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -1,26 +1,42 @@ -import abc -from typing import Dict +from collections import deque +import copy +from queue import Empty, Queue +import threading +import time +from typing import Any, Dict, List, Optional -import numpy as np +import tree # pip install dm_tree -from ray.rllib.algorithms.impala.impala import ( - ImpalaConfig, - LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY, -) +import ray +from ray.rllib.algorithms.impala.impala import LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY from ray.rllib.core.columns import Columns from ray.rllib.core.learner.learner import Learner +from ray.rllib.connectors.learner import AddOneTsToEpisodesAndTruncate +from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict -from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.postprocessing.episodes import ( - add_one_ts_to_episodes_and_truncate, - remove_last_ts_from_data, - remove_last_ts_from_episodes_and_restore_truncateds, +from ray.rllib.utils.metrics import ( + ALL_MODULES, + NUM_ENV_STEPS_SAMPLED_LIFETIME, + NUM_ENV_STEPS_TRAINED, ) -from ray.rllib.utils.postprocessing.value_predictions import extract_bootstrapped_values -from ray.rllib.utils.postprocessing.zero_padding import unpad_data_if_necessary +from ray.rllib.utils.metrics.metrics_logger import MetricsLogger from ray.rllib.utils.schedules.scheduler import Scheduler -from ray.rllib.utils.typing import ModuleID +from ray.rllib.utils.typing import EpisodeType, ModuleID, ResultDict + +torch, _ = try_import_torch() + +GPU_LOADER_QUEUE_WAIT_TIMER = "gpu_loader_queue_wait_timer" +GPU_LOADER_LOAD_TO_GPU_TIMER = "gpu_loader_load_to_gpu_timer" +LEARNER_THREAD_IN_QUEUE_WAIT_TIMER = "learner_thread_in_queue_wait_timer" +LEARNER_THREAD_UPDATE_TIMER = "learner_thread_update_timer" +RAY_GET_EPISODES_TIMER = "ray_get_episodes_timer" +EPISODES_TO_BATCH_TIMER = "episodes_to_batch_timer" + +QUEUE_SIZE_GPU_LOADER_QUEUE = "queue_size_gpu_loader_queue" +QUEUE_SIZE_LEARNER_THREAD_QUEUE = "queue_size_learner_thread_queue" +QUEUE_SIZE_RESULTS_QUEUE = "queue_size_results_queue" class ImpalaLearner(Learner): @@ -41,86 +57,201 @@ def build(self) -> None: ) ) - def _compute_v_trace_from_episodes( + # Extend all episodes by one artificual timestep to allow the value function net + # to compute the bootstrap values (and add a mask to the batch to know, which + # slots to mask out). 
+ if self.config.add_default_connectors_to_learner_pipeline: + self._learner_connector.prepend(AddOneTsToEpisodesAndTruncate()) + + # Create and start the GPU-loader thread. It picks up train-ready batches from + # the "GPU-loader queue" and loads them to the GPU, then places the GPU batches + # on the "update queue" for the actual RLModule forward pass and loss + # computations. + self._gpu_loader_in_queue = Queue() + self._learner_thread_in_queue = deque(maxlen=self.config.learner_queue_size) + self._learner_thread_out_queue = Queue() + + # Create and start the GPU loader thread(s). + self._gpu_loader_threads = [ + _GPULoaderThread( + in_queue=self._gpu_loader_in_queue, + out_queue=self._learner_thread_in_queue, + device=self._device, + metrics_logger=self.metrics, + ) + for _ in range(self.config.num_gpu_loader_threads) + ] + for t in self._gpu_loader_threads: + t.start() + + # Create and start the Learner thread. + self._learner_thread = _LearnerThread( + update_method=self._update_from_batch_or_episodes, + in_queue=self._learner_thread_in_queue, + out_queue=self._learner_thread_out_queue, + metrics_logger=self.metrics, + ) + self._learner_thread.start() + + @override(Learner) + def update_from_episodes( self, + episodes: List[EpisodeType], *, - batch, - episodes, - ): - batch = batch or {} - if not episodes: - return batch, episodes - - # Make all episodes one ts longer in order to just have a single batch - # (and distributed forward pass) for both vf predictions AND the bootstrap - # vf computations. - episode_lens = [len(e) for e in episodes] - orig_truncateds = add_one_ts_to_episodes_and_truncate(episodes) - episode_lens_p1 = [len(e) for e in episodes] - - # Call the learner connector (on the artificially elongated episodes) - # in order to get the batch to pass through the module for vf (and - # bootstrapped vf) computations. - batch_for_vf = self._learner_connector( - rl_module=self.module["default_policy"], # TODO: make multi-agent capable - data={}, - episodes=episodes, - ) - # Perform the value model's forward pass. - vf_preds = convert_to_numpy(self._compute_values(batch_for_vf)) - - # Remove all zero-padding again, if applicable, for the upcoming - # GAE computations. - vf_preds = unpad_data_if_necessary(episode_lens_p1, vf_preds) - # Generate the bootstrap value column (with only one entry per batch row). - batch[Columns.VALUES_BOOTSTRAPPED] = extract_bootstrapped_values( - vf_preds=vf_preds, - episode_lengths=episode_lens, - T=self.config.get_rollout_fragment_length(), + timesteps: Optional[Dict[str, Any]] = None, + # TODO (sven): Deprecate these in favor of config attributes for only those + # algos that actually need (and know how) to do minibatching. + minibatch_size: Optional[int] = None, + num_iters: int = 1, + min_total_mini_batches: int = 0, + reduce_fn=None, # Deprecated args. + **kwargs, + ) -> ResultDict: + # TODO (sven): IMPALA does NOT call additional update anymore from its + # `training_step()` method. Instead, we'll do this here (to avoid the extra + # metrics.reduce() call -> we should only call this once per update round). + self._before_update(timesteps) + + with self.metrics.log_time((ALL_MODULES, RAY_GET_EPISODES_TIMER)): + # Resolve batch/episodes being ray object refs (instead of + # actual batch/episodes objects). + episodes = ray.get(episodes) + episodes = tree.flatten(episodes) + env_steps = sum(map(len, episodes)) + + # Call the learner connector pipeline. 
+ with self.metrics.log_time((ALL_MODULES, EPISODES_TO_BATCH_TIMER)): + batch = self._learner_connector( + rl_module=self.module, + data={}, + episodes=episodes, + shared_data={}, + ) + + # Queue the CPU batch to the GPU-loader thread. + self._gpu_loader_in_queue.put((batch, env_steps)) + self.metrics.log_value( + QUEUE_SIZE_GPU_LOADER_QUEUE, self._gpu_loader_in_queue.qsize() ) - # Remove the extra timesteps again from vf_preds and value targets. Now that - # the GAE computation is done, we don't need this last timestep anymore in any - # of our data. - batch[Columns.VF_PREDS] = remove_last_ts_from_data(episode_lens_p1, vf_preds) - # Remove the extra (artificial) timesteps again at the end of all episodes. - remove_last_ts_from_episodes_and_restore_truncateds(episodes, orig_truncateds) + # Return all queued result dicts thus far (after reducing over them). + results = {} + ts_trained = 0 + try: + while True: + results = self._learner_thread_out_queue.get(block=False) + ts_trained += results[ALL_MODULES][NUM_ENV_STEPS_TRAINED].peek() + except Empty: + if ts_trained: + results[ALL_MODULES][NUM_ENV_STEPS_TRAINED].values = [ts_trained] + return results - return batch, episodes + def _before_update(self, timesteps: Optional[Dict[str, Any]] = None): + timesteps = timesteps or {} + + for module_id in self.module.keys(): + super().additional_update_for_module( + module_id=module_id, + config=self.config.get_config_for_module(module_id), + timestep=timesteps.get(NUM_ENV_STEPS_SAMPLED_LIFETIME, 0), + ) + + # Update entropy coefficient via our Scheduler. + new_entropy_coeff = self.entropy_coeff_schedulers_per_module[ + module_id + ].update(timestep=timesteps.get(NUM_ENV_STEPS_SAMPLED_LIFETIME, 0)) + self.metrics.log_value( + (module_id, LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY), + new_entropy_coeff, + window=1, + ) @override(Learner) def remove_module(self, module_id: str): super().remove_module(module_id) self.entropy_coeff_schedulers_per_module.pop(module_id) - @override(Learner) - def additional_update_for_module( - self, *, module_id: ModuleID, config: ImpalaConfig, timestep: int - ) -> None: - super().additional_update_for_module( - module_id=module_id, config=config, timestep=timestep - ) - # Update entropy coefficient via our Scheduler. - new_entropy_coeff = self.entropy_coeff_schedulers_per_module[module_id].update( - timestep=timestep - ) - self.metrics.log_value( - (module_id, LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY), - new_entropy_coeff, - window=1, - ) +class _GPULoaderThread(threading.Thread): + def __init__( + self, + *, + in_queue: Queue, + out_queue: deque, + device: torch.device, + metrics_logger: MetricsLogger, + ): + super().__init__() + self.daemon = True + + self._in_queue = in_queue + self._out_queue = out_queue + self._device = device + self.metrics = metrics_logger + + def run(self) -> None: + while True: + self._step() + + def _step(self) -> None: + # Only measure time, if we have a `metrics` instance. + with self.metrics.log_time((ALL_MODULES, GPU_LOADER_QUEUE_WAIT_TIMER)): + # Get a new batch from the data (inqueue). + batch_on_cpu, env_steps = self._in_queue.get() + + with self.metrics.log_time((ALL_MODULES, GPU_LOADER_LOAD_TO_GPU_TIMER)): + # Load the batch onto the GPU device. 
+ batch_on_gpu = tree.map_structure_with_path( + lambda path, t: ( + t + if isinstance(path, tuple) and Columns.INFOS in path + else t.to(self._device, non_blocking=True) + ), + batch_on_cpu, + ) + ma_batch_on_gpu = MultiAgentBatch( + policy_batches={mid: SampleBatch(b) for mid, b in batch_on_gpu.items()}, + env_steps=env_steps, + ) + self._out_queue.append(ma_batch_on_gpu) + self.metrics.log_value( + QUEUE_SIZE_LEARNER_THREAD_QUEUE, len(self._out_queue) + ) + + +class _LearnerThread(threading.Thread): + def __init__(self, *, update_method, in_queue, out_queue, metrics_logger): + super().__init__() + self.daemon = True + self.metrics: MetricsLogger = metrics_logger + self.stopped = False + + self._update_method = update_method + self._in_queue: deque = in_queue + self._out_queue: Queue = out_queue + + def run(self) -> None: + while not self.stopped: + self.step() - @abc.abstractmethod - def _compute_values(self, batch) -> np._typing.NDArray: - """Computes the values using the value function module given a batch of data. + def step(self): + # Get a new batch from the GPU-data (deque.pop -> newest item first). + with self.metrics.log_time((ALL_MODULES, LEARNER_THREAD_IN_QUEUE_WAIT_TIMER)): + if not self._in_queue: + time.sleep(0.001) + return + ma_batch_on_gpu = self._in_queue.pop() - Args: - batch: The input batch to pass through our RLModule (value function - encoder and vf-head). + # Call the update method on the batch. + with self.metrics.log_time((ALL_MODULES, LEARNER_THREAD_UPDATE_TIMER)): + # TODO (sven): For multi-agent AND SGD iter > 1, we need to make sure + # this thread has the information about the min minibatches necessary + # (due to different agents taking different steps in the env, e.g. + # MA-CartPole). + results = self._update_method(batch=ma_batch_on_gpu) + # We have to deepcopy the results dict, b/c we must avoid having a returned + # Stats object sit in the queue and getting a new (possibly even tensor) + # value added to it, which would falsify this result. + self._out_queue.put(copy.deepcopy(results)) - Returns: - The batch (numpy) of value function outputs (already squeezed over the last - dimension (which should have shape (1,) b/c of the single value output - node). 
- """ + self.metrics.log_value(QUEUE_SIZE_RESULTS_QUEUE, self._out_queue.qsize()) diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py deleted file mode 100644 index 72cdf43ff302..000000000000 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ /dev/null @@ -1,106 +0,0 @@ -import unittest - -import numpy as np -import tree # pip install dm_tree - -import ray -from ray.rllib.algorithms.impala import ImpalaConfig -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.framework import try_import_torch, try_import_tf -from ray.rllib.utils.test_utils import framework_iterator -from ray.rllib.utils.torch_utils import convert_to_torch_tensor - -torch, nn = try_import_torch() -tf1, tf, _ = try_import_tf() -tf1.enable_eager_execution() - -frag_length = 50 - -FAKE_BATCH = { - SampleBatch.OBS: np.random.uniform(low=0, high=1, size=(frag_length, 4)).astype( - np.float32 - ), - SampleBatch.ACTIONS: np.random.choice(2, frag_length).astype(np.float32), - SampleBatch.REWARDS: np.random.uniform(low=-1, high=1, size=(frag_length,)).astype( - np.float32 - ), - SampleBatch.TERMINATEDS: np.array( - [False for _ in range(frag_length - 1)] + [True] - ).astype(np.float32), - SampleBatch.VF_PREDS: np.array( - list(reversed(range(frag_length))), dtype=np.float32 - ), - SampleBatch.VALUES_BOOTSTRAPPED: np.array( - list(reversed(range(frag_length))), dtype=np.float32 - ), - SampleBatch.ACTION_LOGP: np.log( - np.random.uniform(low=0, high=1, size=(frag_length,)) - ).astype(np.float32), - SampleBatch.ACTION_DIST_INPUTS: np.random.normal( - 0, 1, size=(frag_length, 2) - ).astype(np.float32), -} - - -class TestImpalaLearner(unittest.TestCase): - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def test_impala_loss(self): - """Test that impala_policy_rlm loss matches the impala learner loss. - - Correctness of V-Trance is tested in test_vtrace_v2.py. - """ - config = ( - ImpalaConfig() - .api_stack(enable_rl_module_and_learner=True) - .environment("CartPole-v1") - .env_runners( - num_env_runners=0, - rollout_fragment_length=frag_length, - ) - .resources(num_gpus=0) - .training( - gamma=0.99, - model=dict( - fcnet_hiddens=[10, 10], - fcnet_activation="linear", - vf_share_layers=False, - ), - ) - ) - # TODO (Artur): See if setting Impala's default to {} pose any issues. - # Deprecate the current default and set it to {}. 
- config.exploration_config = {} - - for fw in framework_iterator(config, frameworks=["torch", "tf2"]): - algo = config.build() - - if fw == "torch": - train_batch = convert_to_torch_tensor(SampleBatch(FAKE_BATCH)) - else: - train_batch = SampleBatch( - tree.map_structure(lambda x: tf.convert_to_tensor(x), FAKE_BATCH) - ) - - algo_config = config.copy(copy_frozen=False) - algo_config.num_learners = 0 - learner_group = algo_config.build_learner_group( - env=algo.workers.local_worker().env - ) - learner_group.set_weights(algo.get_weights()) - learner_group.update_from_batch(batch=train_batch.as_multi_agent()) - - algo.stop() - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/algorithms/impala/tests/test_impala_off_policyness.py b/rllib/algorithms/impala/tests/test_impala_off_policyness.py deleted file mode 100644 index 7e25c5d97763..000000000000 --- a/rllib/algorithms/impala/tests/test_impala_off_policyness.py +++ /dev/null @@ -1,60 +0,0 @@ -import unittest - -import ray -import ray.rllib.algorithms.impala as impala -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.test_utils import ( - check_compute_single_action, - framework_iterator, -) - -tf1, tf, tfv = try_import_tf() - - -class TestIMPALAOffPolicyNess(unittest.TestCase): - @classmethod - def setUpClass(cls) -> None: - ray.init() - - @classmethod - def tearDownClass(cls) -> None: - ray.shutdown() - - def test_impala_off_policyness(self): - config = ( - impala.ImpalaConfig() - .api_stack(enable_rl_module_and_learner=True) - .environment("CartPole-v1") - .resources(num_gpus=0) - .env_runners(num_env_runners=4) - ) - num_iterations = 3 - num_aggregation_workers_options = [0, 1] - - for num_aggregation_workers in num_aggregation_workers_options: - for _ in framework_iterator(config, frameworks=("torch", "tf2")): - - # We have to set exploration_config here manually because setting - # it through config.env_runners() only deepupdates it - config.exploration_config = {} - config.num_aggregation_workers = num_aggregation_workers - print("aggregation-workers={}".format(config.num_aggregation_workers)) - algo = config.build() - for i in range(num_iterations): - algo.train() - # TODO (Avnish): Add off-policiness check when the metrics are - # added back to the IMPALA Learner. - # off_policy_ness = check_off_policyness(results, upper_limit=2.0) - # print(f"off-policy'ness={off_policy_ness}") - - check_compute_single_action( - algo, - ) - algo.stop() - - -if __name__ == "__main__": - import pytest - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/algorithms/impala/tf/impala_tf_learner.py b/rllib/algorithms/impala/tf/impala_tf_learner.py index 51f4e27e8dfb..c89e8665e65e 100644 --- a/rllib/algorithms/impala/tf/impala_tf_learner.py +++ b/rllib/algorithms/impala/tf/impala_tf_learner.py @@ -136,7 +136,6 @@ def compute_loss_for_module( # Return the total loss. 
return total_loss - @override(ImpalaLearner) def _compute_values(self, batch): infos = batch.pop(Columns.INFOS, None) batch = tree.map_structure(lambda s: tf.convert_to_tensor(s), batch) diff --git a/rllib/algorithms/impala/torch/impala_torch_learner.py b/rllib/algorithms/impala/torch/impala_torch_learner.py index 88d6849e9339..33102e6ab3f7 100644 --- a/rllib/algorithms/impala/torch/impala_torch_learner.py +++ b/rllib/algorithms/impala/torch/impala_torch_learner.py @@ -6,15 +6,12 @@ vtrace_torch, make_time_major, ) -from ray.rllib.core import DEFAULT_MODULE_ID from ray.rllib.core.columns import Columns from ray.rllib.core.learner.learner import ENTROPY_KEY from ray.rllib.core.learner.torch.torch_learner import TorchLearner -from ray.rllib.core.models.base import CRITIC, ENCODER_OUT from ray.rllib.utils.annotations import override from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import convert_to_torch_tensor from ray.rllib.utils.typing import ModuleID, TensorType torch, nn = try_import_torch() @@ -32,18 +29,51 @@ def compute_loss_for_module( batch: NestedDict, fwd_out: Dict[str, TensorType], ) -> TensorType: - action_dist_class_train = ( - self.module[module_id].unwrapped().get_train_action_dist_cls() - ) - target_policy_dist = action_dist_class_train.from_logits( - fwd_out[Columns.ACTION_DIST_INPUTS] + # TODO (sven): Now that we do the +1ts trick to be less vulnerable about + # bootstrap values at the end of rollouts in the new stack, we might make + # this a more flexible, configurable parameter for users, e.g. + # `v_trace_seq_len` (independent of `rollout_fragment_length`). Separation + # of concerns (sampling vs learning). + recurrent_seq_len = None + rollout_frag_or_episode_len = config.get_rollout_fragment_length() + + loss_mask = batch[Columns.LOSS_MASK].float() + loss_mask_time_major = make_time_major( + loss_mask, + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, ) - values = fwd_out[Columns.VF_PREDS] + size_loss_mask = torch.sum(loss_mask) + # Behavior actions logp and target actions logp. behaviour_actions_logp = batch[Columns.ACTION_LOGP] + target_policy_dist = ( + self.module[module_id] + .unwrapped() + .get_train_action_dist_cls() + .from_logits(fwd_out[Columns.ACTION_DIST_INPUTS]) + ) target_actions_logp = target_policy_dist.logp(batch[Columns.ACTIONS]) - rollout_frag_or_episode_len = config.get_rollout_fragment_length() - recurrent_seq_len = None + + # Values and bootstrap values. + values_time_major = make_time_major( + fwd_out[Columns.VF_PREDS], + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ) + assert Columns.VALUES_BOOTSTRAPPED not in batch + # Use as bootstrap values the vf-preds in the next "batch row", except + # for the very last row (which doesn't have a next row), for which the + # bootstrap value does not matter b/c it has a +1ts value at its end + # anyways. So we chose an arbitrary item (for simplicity of not having to + # move new data to the device). + bootstrap_values = torch.cat( + [ + values_time_major[0][1:], # 0th ts values from "next row" + values_time_major[0][0:1], # <- can use any arbitrary value here + ], + dim=0, + ) # TODO(Artur): In the old impala code, actions were unsqueezed if they were # multi_discrete. Find out why and if we need to do the same here. 
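The hunk above drops the explicit `VALUES_BOOTSTRAPPED` column and instead derives per-row bootstrap values by shifting the t=0 slice of the time-major value predictions: row b bootstraps with the first value of row b+1, and the very last row gets an arbitrary filler, which is harmless because that row already ends in its own +1-timestep value. A toy illustration of just that shift, with made-up shapes and values:

import torch

T, B = 3, 4
# Fake time-major value predictions of shape [T, B]; column b holds row b's values.
values_time_major = torch.arange(T * B, dtype=torch.float32).reshape(B, T).t()
# Bootstrap value of row b := value at t=0 of row b+1; the last row reuses an
# arbitrary entry (here row 0's t=0 value), which does not affect the loss.
bootstrap_values = torch.cat(
    [values_time_major[0][1:], values_time_major[0][:1]], dim=0
)
assert bootstrap_values.shape == (B,)  # one bootstrap value per batch row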
@@ -63,20 +93,6 @@ def compute_loss_for_module( trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - values_time_major = make_time_major( - values, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=recurrent_seq_len, - ) - if self.config.enable_env_runner_and_connector_v2: - bootstrap_values = batch[Columns.VALUES_BOOTSTRAPPED] - else: - bootstrap_values_time_major = make_time_major( - batch[Columns.VALUES_BOOTSTRAPPED], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=recurrent_seq_len, - ) - bootstrap_values = bootstrap_values_time_major[-1] # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. @@ -89,10 +105,6 @@ def compute_loss_for_module( ).type(dtype=torch.float32) ) * config.gamma - # TODO(Artur) Why was there `TorchCategorical if is_multidiscrete else - # dist_class` in the old code torch impala policy? - device = behaviour_actions_logp_time_major[0].device - # Note that vtrace will compute the main loop on the CPU for better performance. vtrace_adjusted_target_values, pg_advantages = vtrace_torch( target_action_log_probs=target_actions_logp_time_major, @@ -105,32 +117,25 @@ def compute_loss_for_module( clip_pg_rho_threshold=config.vtrace_clip_pg_rho_threshold, ) - # Sample size is T x B, where T is the trajectory length and B is the batch size - # We mean over the batch size for consistency with the pre-RLModule - # implementation of IMPALA - # TODO(Artur): Mean over trajectory length after migration to RLModules. - batch_size = ( - convert_to_torch_tensor(target_actions_logp_time_major.shape[-1]) - .float() - .to(device) - ) - # The policy gradients loss. - pi_loss = -torch.sum(target_actions_logp_time_major * pg_advantages) - mean_pi_loss = pi_loss / batch_size + pi_loss = -torch.sum( + target_actions_logp_time_major * pg_advantages * loss_mask_time_major + ) + mean_pi_loss = pi_loss / size_loss_mask # The baseline loss. delta = values_time_major - vtrace_adjusted_target_values - vf_loss = 0.5 * torch.sum(torch.pow(delta, 2.0)) - mean_vf_loss = vf_loss / batch_size + vf_loss = 0.5 * torch.sum(torch.pow(delta, 2.0) * loss_mask_time_major) + mean_vf_loss = vf_loss / size_loss_mask # The entropy loss. - mean_entropy_loss = -torch.mean(target_policy_dist.entropy()) + entropy_loss = -torch.sum(target_policy_dist.entropy() * loss_mask) + mean_entropy_loss = entropy_loss / size_loss_mask # The summed weighted loss. total_loss = ( - pi_loss - + vf_loss * config.vf_loss_coeff + mean_pi_loss + + mean_vf_loss * config.vf_loss_coeff + ( mean_entropy_loss * self.entropy_coeff_schedulers_per_module[ @@ -142,8 +147,10 @@ def compute_loss_for_module( # Log important loss stats. self.metrics.log_dict( { - "pi_loss": mean_pi_loss, - "vf_loss": mean_vf_loss, + "pi_loss": pi_loss, + "mean_pi_loss": mean_pi_loss, + "vf_loss": vf_loss, + "mean_vf_loss": mean_vf_loss, ENTROPY_KEY: -mean_entropy_loss, }, key=module_id, @@ -151,21 +158,3 @@ def compute_loss_for_module( ) # Return the total loss. return total_loss - - @override(ImpalaLearner) - def _compute_values(self, batch): - infos = batch.pop(Columns.INFOS, None) - batch = convert_to_torch_tensor(batch, device=self._device) - # batch = tree.map_structure(lambda s: torch.from_numpy(s), batch) - if infos is not None: - batch[Columns.INFOS] = infos - - # TODO (sven): Make multi-agent capable. - module = self.module[DEFAULT_MODULE_ID].unwrapped() - - # Shared encoder. 
- encoder_outs = module.encoder(batch) - # Value head. - vf_out = module.vf(encoder_outs[ENCODER_OUT][CRITIC]) - # Squeeze out last dimension (single node value head). - return vf_out.squeeze(-1) diff --git a/rllib/algorithms/impala/torch/vtrace_torch_v2.py b/rllib/algorithms/impala/torch/vtrace_torch_v2.py index 8f8be2a63590..45360546522d 100644 --- a/rllib/algorithms/impala/torch/vtrace_torch_v2.py +++ b/rllib/algorithms/impala/torch/vtrace_torch_v2.py @@ -35,22 +35,36 @@ def make_time_major( trajectory_len is None or recurrent_seq_len is None ), "Either trajectory_len or recurrent_seq_len must be set." + # Figure out the sizes of the final B and T axes. if recurrent_seq_len: B = recurrent_seq_len.shape[0] T = tensor.shape[0] // B else: - # Important: chop the tensor into batches at known episode cut - # boundaries. - # TODO: (sven) this is kind of a hack and won't work for - # batch_mode=complete_episodes. T = trajectory_len - B = tensor.shape[0] // T - rs = torch.reshape(tensor, [B, T] + list(tensor.shape[1:])) + # Zero-pad, if necessary. + tensor_0 = tensor.shape[0] + B = tensor_0 // T + if B != (tensor_0 / T): + assert len(tensor.shape) == 1 + tensor = torch.cat( + [ + tensor, + torch.zeros( + trajectory_len - tensor_0 % T, + dtype=tensor.dtype, + device=tensor.device, + ), + ] + ) + B += 1 + + # Reshape tensor (break up B axis into 2 axes: B and T). + tensor = torch.reshape(tensor, [B, T] + list(tensor.shape[1:])) # Swap B and T axes. - res = torch.transpose(rs, 1, 0) + tensor = torch.transpose(tensor, 1, 0) - return res + return tensor def vtrace_torch( diff --git a/rllib/algorithms/ppo/ppo_learner.py b/rllib/algorithms/ppo/ppo_learner.py index 5070859aa0e6..37b1735d02c9 100644 --- a/rllib/algorithms/ppo/ppo_learner.py +++ b/rllib/algorithms/ppo/ppo_learner.py @@ -7,7 +7,6 @@ from ray.rllib.core.columns import Columns from ray.rllib.core.learner.learner import Learner from ray.rllib.evaluation.postprocessing import Postprocessing -from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch from ray.rllib.utils.annotations import override, OverrideToImplementCustomLogic from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict from ray.rllib.utils.numpy import convert_to_numpy @@ -75,7 +74,7 @@ def _compute_gae_from_episodes( self, *, episodes: Optional[List[EpisodeType]] = None, - ) -> Tuple[Optional[MultiAgentBatch], Optional[List[EpisodeType]]]: + ) -> Tuple[Optional[Dict[str, Any]], Optional[List[EpisodeType]]]: """Computes GAE advantages (and value targets) given a list of episodes. Note that the episodes may be SingleAgent- or MultiAgentEpisodes and may be @@ -118,11 +117,6 @@ def _compute_gae_from_episodes( episodes=episodes, shared_data={}, ) - # TODO (sven): Try to not require MultiAgentBatch anymore. - batch_for_vf = MultiAgentBatch( - {mid: SampleBatch(v) for mid, v in batch_for_vf.items()}, - env_steps=sum(len(e) for e in episodes), - ) # Perform the value model's forward pass. 
vf_preds = convert_to_numpy(self._compute_values(batch_for_vf)) @@ -143,14 +137,16 @@ def _compute_gae_from_episodes( module_value_targets = compute_value_targets( values=module_vf_preds, rewards=unpad_data_if_necessary( - episode_lens_plus_1, batch_for_vf[module_id][Columns.REWARDS] + episode_lens_plus_1, + convert_to_numpy(batch_for_vf[module_id][Columns.REWARDS]), ), terminateds=unpad_data_if_necessary( episode_lens_plus_1, - batch_for_vf[module_id][Columns.TERMINATEDS], + convert_to_numpy(batch_for_vf[module_id][Columns.TERMINATEDS]), ), truncateds=unpad_data_if_necessary( - episode_lens_plus_1, batch_for_vf[module_id][Columns.TRUNCATEDS] + episode_lens_plus_1, + convert_to_numpy(batch_for_vf[module_id][Columns.TRUNCATEDS]), ), gamma=self.config.gamma, lambda_=self.config.lambda_, @@ -254,9 +250,7 @@ def _compute_values( tensors. """ return { - module_id: self.module[module_id]._compute_values( - module_batch, self._device - ) - for module_id, module_batch in batch_for_vf.policy_batches.items() + module_id: self.module[module_id].unwrapped()._compute_values(module_batch) + for module_id, module_batch in batch_for_vf.items() if self.should_module_be_updated(module_id, batch_for_vf) } diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py index f7c89f167f8b..1794c24bb5ba 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_env_runner.py @@ -10,11 +10,7 @@ from ray.rllib.core.learner.learner import DEFAULT_OPTIMIZER, LR_KEY from ray.rllib.utils.metrics import LEARNER_RESULTS -from ray.rllib.utils.test_utils import ( - check, - check_train_results_new_api_stack, - framework_iterator, -) +from ray.rllib.utils.test_utils import check, check_train_results_new_api_stack def get_model_config(framework, lstm=False): @@ -97,43 +93,42 @@ def test_ppo_compilation_and_schedule_mixins(self): num_iterations = 2 - for fw in framework_iterator(config, frameworks=("torch", "tf2")): - # TODO (Kourosh) Bring back "FrozenLake-v1" - for env in [ - # "CliffWalking-v0", - "CartPole-v1", - "Pendulum-v1", - ]: # "ALE/Breakout-v5"]: - print("Env={}".format(env)) - for lstm in [False]: - print("LSTM={}".format(lstm)) - config.rl_module( - model_config_dict=get_model_config(fw, lstm=lstm) - ).framework(eager_tracing=False) - - algo = config.build(env=env) - # TODO: Maybe add an API to get the Learner(s) instances within - # a learner group, remote or not. - learner = algo.learner_group._learner - optim = learner.get_optimizer() - # Check initial LR directly set in optimizer vs the first (ts=0) - # value from the schedule. - lr = optim.param_groups[0]["lr"] if fw == "torch" else optim.lr - check(lr, config.lr[0][1]) - - # Check current entropy coeff value using the respective Scheduler. 
- entropy_coeff = learner.entropy_coeff_schedulers_per_module[ - DEFAULT_MODULE_ID - ].get_current_value() - check(entropy_coeff, 0.1) - - for i in range(num_iterations): - results = algo.train() - check_train_results_new_api_stack(results) - print(results) - - # algo.evaluate() - algo.stop() + # TODO (Kourosh) Bring back "FrozenLake-v1" + for env in [ + # "CliffWalking-v0", + "CartPole-v1", + "Pendulum-v1", + ]: # "ALE/Breakout-v5"]: + print("Env={}".format(env)) + for lstm in [False]: + print("LSTM={}".format(lstm)) + config.rl_module( + model_config_dict=get_model_config("torch", lstm=lstm) + ).framework(eager_tracing=False) + + algo = config.build(env=env) + # TODO: Maybe add an API to get the Learner(s) instances within + # a learner group, remote or not. + learner = algo.learner_group._learner + optim = learner.get_optimizer() + # Check initial LR directly set in optimizer vs the first (ts=0) + # value from the schedule. + lr = optim.param_groups[0]["lr"] + check(lr, config.lr[0][1]) + + # Check current entropy coeff value using the respective Scheduler. + entropy_coeff = learner.entropy_coeff_schedulers_per_module[ + DEFAULT_MODULE_ID + ].get_current_value() + check(entropy_coeff, 0.1) + + for i in range(num_iterations): + results = algo.train() + check_train_results_new_api_stack(results) + print(results) + + # algo.evaluate() + algo.stop() if __name__ == "__main__": diff --git a/rllib/algorithms/ppo/torch/ppo_torch_learner.py b/rllib/algorithms/ppo/torch/ppo_torch_learner.py index a7d5ef9ac4f7..afc0e15b42c3 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_learner.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_learner.py @@ -46,11 +46,11 @@ def compute_loss_for_module( # and for PPO's batched value function (and bootstrap value) computations, # for which we add an additional (artificial) timestep to each episode to # simplify the actual computation. - if "loss_mask" in batch: - num_valid = torch.sum(batch["loss_mask"]) + if Columns.LOSS_MASK in batch: + num_valid = torch.sum(batch[Columns.LOSS_MASK]) def possibly_masked_mean(data_): - return torch.sum(data_[batch["loss_mask"]]) / num_valid + return torch.sum(data_[batch[Columns.LOSS_MASK]]) / num_valid else: possibly_masked_mean = torch.mean @@ -152,7 +152,7 @@ def additional_update_for_module( # Update KL coefficient. if config.use_kl_loss: - assert sampled_kl_values, "Sampled KL values are empty." + assert module_id in sampled_kl_values, "Sampled KL values are empty." sampled_kl = sampled_kl_values[module_id] curr_var = self.curr_kl_coeffs_per_module[module_id] if sampled_kl > 2.0 * config.kl_target: diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py index 78c041878d1e..36f85e762c12 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py @@ -9,7 +9,6 @@ from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.nested_dict import NestedDict -from ray.rllib.utils.torch_utils import convert_to_torch_tensor torch, nn = try_import_torch() @@ -112,13 +111,11 @@ def _forward_train(self, batch: NestedDict) -> Dict[str, Any]: return output + # TODO (sven): Try to move entire GAE computation into PPO's loss function (similar + # to IMPALA's v-trace architecture). This would also get rid of the second + # Connector pass currently necessary. 
@override(PPORLModule) - def _compute_values(self, batch, device=None): - infos = batch.pop(Columns.INFOS, None) - batch = convert_to_torch_tensor(batch, device=device) - if infos is not None: - batch[Columns.INFOS] = infos - + def _compute_values(self, batch): # Separate vf-encoder. if hasattr(self.encoder, "critic_encoder"): if self.is_stateful(): diff --git a/rllib/connectors/common/batch_individual_items.py b/rllib/connectors/common/batch_individual_items.py index 9b5460b4cb49..b095d4d77a7a 100644 --- a/rllib/connectors/common/batch_individual_items.py +++ b/rllib/connectors/common/batch_individual_items.py @@ -1,17 +1,39 @@ from typing import Any, List, Optional +import gymnasium as gym + from ray.rllib.connectors.connector_v2 import ConnectorV2 from ray.rllib.core import DEFAULT_MODULE_ID from ray.rllib.core.columns import Columns from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.env.multi_agent_episode import MultiAgentEpisode from ray.rllib.utils.annotations import override from ray.rllib.utils.spaces.space_utils import batch from ray.rllib.utils.typing import EpisodeType class BatchIndividualItems(ConnectorV2): + def __init__( + self, + input_observation_space: Optional[gym.Space] = None, + input_action_space: Optional[gym.Space] = None, + *, + multi_agent: bool = False, + **kwargs, + ): + """Initializes a BatchIndividualItems instance. + + Args: + multi_agent: Whether this is a connector operating on a multi-agent + observation space mapping AgentIDs to individual agents' observations. + """ + super().__init__( + input_observation_space=input_observation_space, + input_action_space=input_action_space, + **kwargs, + ) + self._multi_agent = multi_agent + @override(ConnectorV2) def __call__( self, @@ -23,7 +45,6 @@ def __call__( shared_data: Optional[dict] = None, **kwargs, ) -> Any: - is_multi_agent = isinstance(episodes[0], MultiAgentEpisode) is_marl_module = isinstance(rl_module, MultiAgentRLModule) # Convert lists of individual items into properly batched data. @@ -33,8 +54,10 @@ def __call__( # to a batch structure of: # [module_id] -> [col0] -> [list of items] if is_marl_module and column in rl_module: - # assert is_multi_agent - # TODO (simon, sven): Check, if we need for other cases this check. + # Case, in which a column has already been properly batched before this + # connector piece is called. + if not self._multi_agent: + continue # If MA Off-Policy and independent sampling we need to overcome # this check. module_data = column_data @@ -56,7 +79,7 @@ def __call__( # Single-agent case: There is a dict under `column` mapping # `eps_id` to lists of items: # Sort by eps_id, concat all these lists, then batch. 
-        elif not is_multi_agent:
+        elif not self._multi_agent:
             # TODO: only really need this in non-Learner connector pipeline
             memorized_map_structure = []
             list_to_be_batched = []
diff --git a/rllib/connectors/common/numpy_to_tensor.py b/rllib/connectors/common/numpy_to_tensor.py
index 8a75c85b57b1..b3d2c44d5f0a 100644
--- a/rllib/connectors/common/numpy_to_tensor.py
+++ b/rllib/connectors/common/numpy_to_tensor.py
@@ -5,10 +5,10 @@
 from ray.rllib.connectors.connector_v2 import ConnectorV2
 from ray.rllib.core import DEFAULT_MODULE_ID
 from ray.rllib.core.columns import Columns
+from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule
 from ray.rllib.core.rl_module.rl_module import RLModule
-from ray.rllib.env.multi_agent_episode import MultiAgentEpisode
 from ray.rllib.utils.annotations import override
-from ray.rllib.utils.framework import convert_to_tensor
+from ray.rllib.utils.torch_utils import convert_to_torch_tensor
 from ray.rllib.utils.typing import EpisodeType


@@ -25,14 +25,30 @@ def __init__(
         input_action_space: Optional[gym.Space] = None,
         *,
         as_learner_connector: bool = False,
+        pin_mempory: Optional[bool] = None,
+        device: Optional[str] = None,
         **kwargs,
     ):
+        """Initializes a NumpyToTensor instance.
+
+        Args:
+            as_learner_connector: Whether this ConnectorV2 piece is used inside a
+                LearnerConnectorPipeline or not.
+            pin_mempory: Whether to pin memory when creating (torch) tensors.
+                If None (default), pins memory if `as_learner_connector` is True,
+                otherwise doesn't pin memory.
+            **kwargs: Forward compatibility kwargs.
+        """
         super().__init__(
             input_observation_space=input_observation_space,
             input_action_space=input_action_space,
             **kwargs,
         )
         self._as_learner_connector = as_learner_connector
+        self._pin_memory = (
+            pin_mempory if pin_mempory is not None else self._as_learner_connector
+        )
+        self._device = device

     @override(ConnectorV2)
     def __call__(
@@ -45,20 +61,29 @@ def __call__(
         shared_data: Optional[dict] = None,
         **kwargs,
     ) -> Any:
-        is_multi_agent = isinstance(episodes[0], MultiAgentEpisode)
-
-        if not is_multi_agent:
+        is_single_agent = False
+        is_marl_module = isinstance(rl_module, MultiAgentRLModule)
+        # `data` is already in ModuleID-to-batch mapping format.
+        if not (is_marl_module and all(c in rl_module._rl_modules for c in data)):
+            is_single_agent = True
             data = {DEFAULT_MODULE_ID: data}

         # TODO (sven): Support specifying a device (e.g. GPU).
         for module_id, module_data in data.copy().items():
             infos = module_data.pop(Columns.INFOS, None)
-            module_data = convert_to_tensor(module_data, framework=rl_module.framework)
+            if rl_module.framework == "torch":
+                module_data = convert_to_torch_tensor(
+                    module_data, pin_memory=self._pin_memory, device=self._device
+                )
+            else:
+                raise ValueError(
+                    "`NumpyToTensor` does NOT support frameworks other than torch!"
+                )
             if infos is not None:
                 module_data[Columns.INFOS] = infos
-            # Early out if not multi-agent AND not learner connector (which
-            # does always operate on a MARLModule).
+            # Early out with data under(!) `DEFAULT_MODULE_ID`, b/c we are in plain
+            # single-agent mode.
+ if is_single_agent: return module_data data[module_id] = module_data diff --git a/rllib/connectors/learner/__init__.py b/rllib/connectors/learner/__init__.py index 33ea3c80c4f6..9e0e0c772b6f 100644 --- a/rllib/connectors/learner/__init__.py +++ b/rllib/connectors/learner/__init__.py @@ -13,6 +13,9 @@ from ray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batch import ( # noqa AddNextObservationsFromEpisodesToTrainBatch, ) +from ray.rllib.connectors.learner.add_one_ts_to_episodes_and_truncate import ( + AddOneTsToEpisodesAndTruncate, +) from ray.rllib.connectors.learner.learner_connector_pipeline import ( LearnerConnectorPipeline, ) @@ -21,6 +24,7 @@ "AddColumnsFromEpisodesToTrainBatch", "AddNextObservationsFromEpisodesToTrainBatch", "AddObservationsFromEpisodesToBatch", + "AddOneTsToEpisodesAndTruncate", "AddStatesFromEpisodesToBatch", "AgentToModuleMapping", "BatchIndividualItems", diff --git a/rllib/connectors/learner/add_one_ts_to_episodes_and_truncate.py b/rllib/connectors/learner/add_one_ts_to_episodes_and_truncate.py new file mode 100644 index 000000000000..9e1cd68a88a3 --- /dev/null +++ b/rllib/connectors/learner/add_one_ts_to_episodes_and_truncate.py @@ -0,0 +1,131 @@ +from typing import Any, List, Optional + +from ray.rllib.connectors.connector_v2 import ConnectorV2 +from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.utils.annotations import override +from ray.rllib.utils.postprocessing.episodes import add_one_ts_to_episodes_and_truncate +from ray.rllib.utils.typing import EpisodeType + + +class AddOneTsToEpisodesAndTruncate(ConnectorV2): + """Adds an artificial timestep to all incoming episodes at the end. + + In detail: The last observations, infos, actions, and all `extra_model_outputs` + will be duplicated and appended to each episode's data. An extra 0.0 reward + will be appended to the episode's rewards. The episode's timestep will be + increased by 1. Also, adds the truncated=True flag to each episode if the + episode is not already done (terminated or truncated). + + Useful for value function bootstrapping, where it is required to compute a + forward pass for the very last timestep within the episode, + i.e. using the following input dict: { + obs=[final obs], + state=[final state output], + prev. reward=[final reward], + etc.. + } + + .. testcode:: + + from ray.rllib.connectors.learner import AddOneTsToEpisodesAndTruncate + from ray.rllib.env.single_agent_episode import SingleAgentEpisode + from ray.rllib.utils.test_utils import check + + # Create 2 episodes (both to be extended by one timestep). + episode1 = SingleAgentEpisode( + observations=[0, 1, 2], + actions=[0, 1], + rewards=[0.0, 1.0], + terminated=False, + truncated=False, + len_lookback_buffer=0, + ).finalize() + check(len(episode1), 2) + check(episode1.is_truncated, False) + + episode2 = SingleAgentEpisode( + observations=[0, 1, 2, 3, 4, 5], + actions=[0, 1, 2, 3, 4], + rewards=[0.0, 1.0, 2.0, 3.0, 4.0], + terminated=True, # a terminated episode + truncated=False, + len_lookback_buffer=0, + ).finalize() + check(len(episode2), 5) + check(episode2.is_truncated, False) + check(episode2.is_terminated, True) + + # Create an instance of this class. + connector = AddOneTsToEpisodesAndTruncate() + + # Call the connector. + shared_data = {} + _ = connector( + rl_module=None, # Connector used here does not require RLModule. + data={}, + episodes=[episode1, episode2], + shared_data=shared_data, + ) + # Check on the episodes. 
Both of them should now be 1 timestep longer. + check(len(episode1), 3) + check(episode1.is_truncated, True) + check(len(episode2), 6) + check(episode2.is_truncated, False) + check(episode2.is_terminated, True) + """ + + @override(ConnectorV2) + def __call__( + self, + *, + rl_module: RLModule, + data: Optional[Any], + episodes: List[EpisodeType], + explore: Optional[bool] = None, + shared_data: Optional[dict] = None, + **kwargs, + ) -> Any: + # Build the loss mask to make sure the extra added timesteps do not influence + # the final loss and fix the terminateds and truncateds in the batch. + + # For proper v-trace execution, the rules must be as follows: + # Legend: + # T: terminal=True + # R: truncated=True + # B0: bootstrap with value 0 (also: terminal=True) + # Bx: bootstrap with some vf-computed value (also: terminal=True) + + # batch: - - - - - - - T B0- - - - - R Bx- - - - R Bx + # mask : t t t t t t t t f t t t t t t f t t t t t f + + shared_data["_sa_episodes_lengths"] = {} + for sa_episode in self.single_agent_episode_iterator( + episodes, agents_that_stepped_only=False + ): + len_ = len(sa_episode) + + # Extend all episodes by one ts. + add_one_ts_to_episodes_and_truncate([sa_episode]) + + loss_mask = [True for _ in range(len_)] + [False] + self.add_n_batch_items( + data, + Columns.LOSS_MASK, + loss_mask, + len_ + 1, + sa_episode, + ) + + terminateds = ( + [False for _ in range(len_ - 1)] + [sa_episode.is_terminated] + [True] + ) + self.add_n_batch_items( + data, + Columns.TERMINATEDS, + terminateds, + len_ + 1, + sa_episode, + ) + + return data diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index b71877fdf892..757929e149c5 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -8,6 +8,7 @@ from typing import ( Any, Callable, + Container, Dict, List, Hashable, @@ -19,6 +20,8 @@ Union, ) +import tree # pip install dm_tree + import ray from ray.rllib.connectors.learner.learner_connector_pipeline import ( LearnerConnectorPipeline, @@ -30,6 +33,7 @@ ) from ray.rllib.core.rl_module.rl_module import RLModule, SingleAgentRLModuleSpec from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch +from ray.rllib.utils import force_list from ray.rllib.utils.annotations import ( OverrideToImplementCustomLogic, OverrideToImplementCustomLogic_CallToSuperRecommended, @@ -52,7 +56,6 @@ MiniBatchDummyIterator, MiniBatchCyclicIterator, ) -from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.schedules.scheduler import Scheduler from ray.rllib.utils.serialization import serialize_type @@ -313,6 +316,7 @@ def build(self) -> None: self._learner_connector = self.config.build_learner_connector( input_observation_space=None, input_action_space=None, + device=self._device, ) # Build the module to be trained by this learner. @@ -811,8 +815,8 @@ def should_module_be_updated(self, module_id, multi_agent_batch=None): def compute_loss( self, *, - fwd_out: Union[MultiAgentBatch, NestedDict], - batch: Union[MultiAgentBatch, NestedDict], + fwd_out: Dict[str, Any], + batch: Dict[str, Any], ) -> Union[TensorType, Dict[str, Any]]: """Computes the loss for the module being optimized. @@ -867,7 +871,7 @@ def compute_loss_for_module( *, module_id: ModuleID, config: Optional["AlgorithmConfig"] = None, - batch: NestedDict, + batch: Dict[str, Any], fwd_out: Dict[str, TensorType], ) -> TensorType: """Computes the loss for a single module. 
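To show how the `Columns.LOSS_MASK` written by `AddOneTsToEpisodesAndTruncate` is consumed downstream (compare the `possibly_masked_mean` pattern in the PPO torch learner above), here is a small hypothetical sketch; the loss values are made up and only the masking arithmetic is the point.

import torch

# Per-timestep losses; the last timestep is the artificially appended bootstrap step.
per_ts_loss = torch.tensor([0.5, 0.2, 0.9, 0.0])
loss_mask = torch.tensor([True, True, True, False])  # as built by the connector

num_valid = torch.sum(loss_mask)
masked_mean = torch.sum(per_ts_loss[loss_mask]) / num_valid
print(masked_mean)  # averages only the 3 real timesteps -> tensor(0.5333)
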
@@ -1160,7 +1164,7 @@ def update_from_episodes( @abc.abstractmethod def _update( self, - batch: NestedDict, + batch: Dict[str, Any], **kwargs, ) -> Tuple[Any, Any, Any]: """Contains all logic for an in-graph/traceable update step. @@ -1171,7 +1175,8 @@ def _update( with all the individual results. Args: - batch: The train batch already converted in to a (tensor) NestedDict. + batch: The train batch already converted to a Dict mapping str to (possibly + nested) tensors. kwargs: Forward compatibility kwargs. Returns: @@ -1196,7 +1201,8 @@ def set_state(self, state: Dict[str, Any]) -> None: """ self._check_is_built() - module_state = state.get("module_state") + # TODO (sven): Deprecate old state keys and create constants for new ones. + module_state = state.get("rl_module", state.get("module_state")) # TODO: once we figure out the optimizer format, we can set/get the state if module_state is None: raise ValueError( @@ -1204,7 +1210,8 @@ def set_state(self, state: Dict[str, Any]) -> None: ) self.set_module_state(module_state) - optimizer_state = state.get("optimizer_state") + # TODO (sven): Deprecate old state keys and create constants for new ones. + optimizer_state = state.get("optimizer", state.get("optimizer_state")) if optimizer_state is None: raise ValueError( "state must have a key 'optimizer_state' for the optimizer weights" @@ -1215,19 +1222,43 @@ def set_state(self, state: Dict[str, Any]) -> None: # If not provided in state (None), all Modules will be trained by default. self.config.multi_agent(policies_to_train=state.get("modules_to_train")) - def get_state(self) -> Dict[str, Any]: - """Get the state of the learner. + def get_state( + self, + components: Optional[Union[str, List[str]]] = None, + *, + inference_only: bool = False, + module_ids: Optional[Container[ModuleID]] = None, + ) -> Dict[str, Any]: + """Get (select components of) the state of this Learner. + + Args: + components: Either None (return all components) or one of "rl_module", + "optimizer", or "modules_to_be_updated", or a list of either of these. + inference_only: Whether to return the inference-only weight set of the + underlying RLModule. Note that this setting only has an effect if + components is None or the string "rl_module" is in components. + module_ids: Optional container of ModuleIDs to be returned only within the + state dict. If None (default), all module IDs' weights are returned. Returns: - The state of the optimizer and module. - + The state (or select components thereof) of this Learner. """ self._check_is_built() - return { - "module_state": self.get_module_state(), - "optimizer_state": self.get_optimizer_state(), - "modules_to_train": self.config.policies_to_train, - } + components = force_list(components) or [ + "rl_module", + "optimizer", + "modules_to_be_updated", + ] + state = {} + if "rl_module" in components: + state["rl_module"] = self.get_module_state( + inference_only=inference_only, module_ids=module_ids + ) + if "optimizer" in components: + state["optimizer"] = self.get_optimizer_state() + if "modules_to_be_updated" in components: + state["modules_to_be_updated"] = self.config.policies_to_train + return state def set_optimizer_state(self, state: Dict[str, Any]) -> None: """Sets the state of all optimizers currently registered in this Learner. @@ -1263,14 +1294,25 @@ def _update_from_batch_or_episodes( self._check_is_built() + # Resolve batch/episodes being ray object refs (instead of + # actual batch/episodes objects). 
+ if isinstance(batch, ray.ObjectRef): + batch = ray.get(batch) + if isinstance(episodes, ray.ObjectRef) or ( + isinstance(episodes, list) and isinstance(episodes[0], ray.ObjectRef) + ): + episodes = ray.get(episodes) + episodes = tree.flatten(episodes) + # Call the learner connector. + shared_data = {} if self._learner_connector is not None and episodes is not None: # Call the learner connector pipeline. batch = self._learner_connector( rl_module=self.module, data=batch if batch is not None else {}, episodes=episodes, - shared_data={}, + shared_data=shared_data, ) # Convert to a batch. # TODO (sven): Try to not require MultiAgentBatch anymore. @@ -1300,7 +1342,7 @@ def _update_from_batch_or_episodes( # Log all timesteps (env, agent, modules) based on given episodes. if self._learner_connector is not None and episodes is not None: - self._log_steps_trained_metrics(episodes, batch) + self._log_steps_trained_metrics(episodes, batch, shared_data) # TODO (sven): Possibly remove this if-else block entirely. We might be in a # world soon where we always learn from episodes, never from an incoming batch. else: @@ -1340,15 +1382,15 @@ def _update_from_batch_or_episodes( # Convert input batch into a tensor batch (MultiAgentBatch) on the correct # device (e.g. GPU). We move the batch already here to avoid having to move # every single minibatch that is created in the `batch_iter` below. - batch = self._convert_batch_type(batch) + if self._learner_connector is None: + batch = self._convert_batch_type(batch) batch = self._set_slicing_by_batch_id(batch, value=True) for tensor_minibatch in batch_iter(batch, minibatch_size, num_iters): # Make the actual in-graph/traced `_update` call. This should return # all tensor values (no numpy). - nested_tensor_minibatch = NestedDict(tensor_minibatch.policy_batches) fwd_out, loss_per_module, tensor_metrics = self._update( - nested_tensor_minibatch + tensor_minibatch.policy_batches ) # Convert logged tensor metrics (logged during tensor-mode of MetricsLogger) @@ -1658,10 +1700,11 @@ def _set_optimizer_lr(optimizer: Optimizer, lr: float) -> None: def _get_clip_function() -> Callable: """Returns the gradient clipping function to use, given the framework.""" - def _log_steps_trained_metrics(self, episodes, batch): + def _log_steps_trained_metrics(self, episodes, batch, shared_data): # Logs this iteration's steps trained, based on given `episodes`. env_steps = sum(len(e) for e in episodes) log_dict = defaultdict(dict) + orig_lengths = shared_data.get("_sa_episodes_lengths", {}) for sa_episode in self._learner_connector.single_agent_episode_iterator( episodes, agents_that_stepped_only=False ): @@ -1674,8 +1717,11 @@ def _log_steps_trained_metrics(self, episodes, batch): if mid != ALL_MODULES and mid not in batch.policy_batches: continue - _len = len(sa_episode) - + _len = ( + orig_lengths[sa_episode.id_] + if sa_episode.id_ in orig_lengths + else len(sa_episode) + ) # TODO (sven): Decide, whether agent_ids should be part of LEARNER_RESULTS. # Currently and historically, only ModuleID keys and ALL_MODULES were used # and expected. Does it make sense to include e.g. agent steps trained? 
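As a rough usage sketch of the reworked `Learner.get_state()` above (assuming an already built `learner` instance; this snippet is illustrative and not taken from the patch):

# Fetch only the RLModule weights, e.g. for broadcasting to EnvRunners,
# and skip the potentially large optimizer state.
state = learner.get_state(components="rl_module", inference_only=True)
rl_module_weights = state["rl_module"]

# With components=None (the default), all three components are returned.
full_state = learner.get_state()
assert set(full_state) == {"rl_module", "optimizer", "modules_to_be_updated"}
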
diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index 2b066f981943..639ff1c3e1d6 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -4,19 +4,20 @@ from typing import ( Any, Callable, + Container, Dict, List, Optional, Set, Type, + TYPE_CHECKING, Union, ) -import uuid import tree # pip install dm_tree import ray -from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray import ObjectRef from ray.rllib.core.learner.learner import Learner from ray.rllib.core.rl_module.rl_module import ( SingleAgentRLModuleSpec, @@ -34,6 +35,7 @@ from ray.rllib.utils.minibatch_utils import ( ShardBatchIterator, ShardEpisodesIterator, + ShardObjectRefIterator, ) from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.typing import ( @@ -46,6 +48,9 @@ from ray.tune.utils.file_transfer import sync_dir_between_nodes from ray.util.annotations import PublicAPI +if TYPE_CHECKING: + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig + def _get_backend_config(learner_class: Type[Learner]) -> str: if learner_class.framework == "torch": @@ -76,11 +81,8 @@ class LearnerGroup: def __init__( self, *, - config: AlgorithmConfig = None, # TODO (sven): Make this arg mandatory. + config: "AlgorithmConfig", module_spec: Optional[RLModuleSpec] = None, - max_queue_len: int = 20, - # Deprecated args. - learner_spec=None, ): """Initializes a LearnerGroup instance. @@ -97,23 +99,7 @@ def __init__( the specifics for your RLModule to be used in each Learner. module_spec: If not already specified in `config`, a separate overriding RLModuleSpec may be provided via this argument. - max_queue_len: The maximum number of batches to queue up if doing - async_update. If the queue is full it will evict the oldest batch first. """ - if learner_spec is not None: - deprecation_warning( - old="LearnerGroup(learner_spec=...)", - new="config = AlgorithmConfig().[resources|training|rl_module](...); " - "LearnerGroup(config=config)", - error=True, - ) - if config is None: - raise ValueError( - "LearnerGroup constructor must be called with a `config` arg! " - "Pass in a `ray.rllib.algorithms.algorithm_config::AlgorithmConfig` " - "object with the proper settings configured." - ) - # scaling_config = learner_spec.learner_group_scaling_config self.config = config @@ -187,7 +173,10 @@ def __init__( # in-flight. Used for keeping trakc of and grouping together the results of # requests that were sent to the workers at the same time. self._update_request_tags = Counter() + self._update_request_tag = 0 + self._update_request_results = {} self._additional_update_request_tags = Counter() + self._additional_update_request_tag = 0 # A special MetricsLogger object (not exposed to the user) for reducing # the n results dicts returned by our n Learner workers in case we are on @@ -197,6 +186,7 @@ def __init__( self._metrics_logger_old_and_hybrid_stack = MetricsLogger() # TODO (sven): Replace this with call to `self.metrics.peek()`? + # Currently LearnerGroup does not have a metrics object. def get_stats(self) -> Dict[str, Any]: """Returns the current stats for the input queue for this learner group.""" return { @@ -222,13 +212,16 @@ def update_from_batch( *, timesteps: Optional[Dict[str, Any]] = None, async_update: bool = False, - # TODO (sven): Deprecate the following args. They should be extracted from + return_state: bool = False, + # TODO (sven): Deprecate the following args. 
They should be extracted from the # self.config of those specific algorithms that actually require these # settings. minibatch_size: Optional[int] = None, num_iters: int = 1, # Already deprecated args. reduce_fn=DEPRECATED_VALUE, + # User kwargs. + **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: """Performs gradient based update(s) on the Learner(s), based on given batch. @@ -240,6 +233,12 @@ def update_from_batch( sent asynchronously. If True, will return NOT the results from the update on the given data, but all results from prior asynchronous update requests that have not been returned thus far. + return_state: Whether to include one of the Learner worker's state from + after the update step in the returned results dict (under the + `_rl_module_state_after_update` key). Note that after an update, all + Learner workers' states should be identical, so we use the first + Learner's state here. Useful for avoiding an extra `get_weights()` call, + e.g. for synchronizing EnvRunner weights. minibatch_size: The minibatch size to use for the update. num_iters: The number of complete passes over all the sub-batches in the input multi-agent batch. @@ -269,8 +268,10 @@ def update_from_batch( batch=batch, timesteps=timesteps, async_update=async_update, + return_state=return_state, minibatch_size=minibatch_size, num_iters=num_iters, + **kwargs, ) def update_from_episodes( @@ -279,13 +280,16 @@ def update_from_episodes( *, timesteps: Optional[Dict[str, Any]] = None, async_update: bool = False, - # TODO (sven): Deprecate the following args. They should be extracted from + return_state: bool = False, + # TODO (sven): Deprecate the following args. They should be extracted from the # self.config of those specific algorithms that actually require these # settings. minibatch_size: Optional[int] = None, num_iters: int = 1, # Already deprecated args. reduce_fn=DEPRECATED_VALUE, + # User kwargs. + **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: """Performs gradient based update(s) on the Learner(s), based on given episodes. @@ -297,6 +301,12 @@ def update_from_episodes( sent asynchronously. If True, will return NOT the results from the update on the given data, but all results from prior asynchronous update requests that have not been returned thus far. + return_state: Whether to include one of the Learner worker's state from + after the update step in the returned results dict (under the + `_rl_module_state_after_update` key). Note that after an update, all + Learner workers' states should be identical, so we use the first + Learner's state here. Useful for avoiding an extra `get_weights()` call, + e.g. for synchronizing EnvRunner weights. minibatch_size: The minibatch size to use for the update. num_iters: The number of complete passes over all the sub-batches in the input multi-agent batch. @@ -327,8 +337,10 @@ def update_from_episodes( episodes=episodes, timesteps=timesteps, async_update=async_update, + return_state=return_state, minibatch_size=minibatch_size, num_iters=num_iters, + **kwargs, ) def _update( @@ -338,49 +350,64 @@ def _update( episodes: Optional[List[EpisodeType]] = None, timesteps: Optional[Dict[str, Any]] = None, async_update: bool = False, + return_state: bool = False, minibatch_size: Optional[int] = None, num_iters: int = 1, + **kwargs, ) -> Union[Dict[str, Any], List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Define function to be called on all Learner actors (or the local learner). 
def _learner_update( - learner: Learner, + _learner: Learner, + *, _batch_shard=None, _episodes_shard=None, _timesteps=None, + _return_state=False, _min_total_mini_batches=0, + **_kwargs, ): if _batch_shard is not None: - return learner.update_from_batch( + result = _learner.update_from_batch( batch=_batch_shard, timesteps=_timesteps, minibatch_size=minibatch_size, num_iters=num_iters, + **_kwargs, ) else: - return learner.update_from_episodes( + result = _learner.update_from_episodes( episodes=_episodes_shard, timesteps=_timesteps, minibatch_size=minibatch_size, num_iters=num_iters, min_total_mini_batches=_min_total_mini_batches, + **_kwargs, ) + if _return_state: + result["_rl_module_state_after_update"] = _learner.get_state( + components="rl_module", inference_only=True + )["rl_module"] + + return result # Local Learner worker: Don't shard batch/episodes, just run data as-is through # this Learner. if self.is_local: if async_update: raise ValueError( - "Cannot call `update_from_batch(update_async=True)` when running in" + "Cannot call `update_from_batch(async_update=True)` when running in" " local mode! Try setting `config.num_learners > 0`." ) results = [ _learner_update( - learner=self._learner, + _learner=self._learner, _batch_shard=batch, _episodes_shard=episodes, _timesteps=timesteps, + _return_state=return_state, + **kwargs, ) ] # One or more remote Learners: Shard batch/episodes into equal pieces (roughly @@ -392,12 +419,33 @@ def _learner_update( # the relationship of the different agents' timesteps to each other. # Thus, in case the algorithm requires agent-synchronized data (aka. # "lockstep"), the `ShardBatchIterator` should not be used. - if episodes is None: + # Then again, we might move into a world where Learner always + # receives Episodes, never batches. + if batch is not None: partials = [ partial( - _learner_update, _batch_shard=batch_shard, _timesteps=timesteps + _learner_update, + _batch_shard=batch_shard, + _return_state=(return_state and i == 0), + _timesteps=timesteps, + **kwargs, + ) + for i, batch_shard in enumerate( + ShardBatchIterator(batch, len(self._workers)) + ) + ] + elif isinstance(episodes, list) and isinstance(episodes[0], ObjectRef): + partials = [ + partial( + _learner_update, + _episodes_shard=episodes_shard, + _timesteps=timesteps, + _return_state=(return_state and i == 0), + **kwargs, + ) + for i, episodes_shard in enumerate( + ShardObjectRefIterator(episodes, len(self._workers)) ) - for batch_shard in ShardBatchIterator(batch, len(self._workers)) ] # Single- or MultiAgentEpisodes: Shard into equal pieces (only roughly equal # in case of multi-agent). @@ -435,25 +483,43 @@ def _learner_update( _learner_update, _episodes_shard=eps_shard, _timesteps=timesteps, + _return_state=(return_state and i == 0), _min_total_mini_batches=min_total_mini_batches, + **kwargs, ) - for eps_shard in eps_shards + for i, eps_shard in enumerate(eps_shards) ] if async_update: # Retrieve all ready results (kicked off by prior calls to this method). 
- results = None - if self._update_request_tags: - results = self._worker_manager.fetch_ready_async_reqs( - tags=list(self._update_request_tags) + tags_to_get = [] + for tag in self._update_request_tags.keys(): + result = self._worker_manager.fetch_ready_async_reqs( + tags=[str(tag)], timeout_seconds=0.0 ) - - update_tag = str(uuid.uuid4()) - + if tag not in self._update_request_results: + self._update_request_results[tag] = result + else: + for r in result: + self._update_request_results[tag].add_result( + r.actor_id, r.result_or_error, tag + ) + + # Still not done with this `tag` -> skip out early. + if ( + self._update_request_tags[tag] + > len(self._update_request_results[tag].result_or_errors) + > 0 + ): + break + tags_to_get.append(tag) + + # Send out new request(s), if there is still capacity on the actors. + update_tag = self._update_request_tag + self._update_request_tag += 1 num_sent_requests = self._worker_manager.foreach_actor_async( - partials, tag=update_tag + partials, tag=str(update_tag) ) - if num_sent_requests: self._update_request_tags[update_tag] = num_sent_requests @@ -461,24 +527,36 @@ def _learner_update( if num_sent_requests != len(self._workers): # assert num_sent_requests == 0, num_sent_requests factor = 1 - (num_sent_requests / len(self._workers)) + # Batch: Measure its length. if episodes is None: - self._ts_dropped += factor * len(batch) + dropped = len(batch) + # List of Ray ObjectRefs (each object ref is a list of episodes of + # total len=`rollout_fragment_length * num_envs_per_env_runner`) + elif isinstance(episodes[0], ObjectRef): + dropped = ( + len(episodes) + * self.config.get_rollout_fragment_length() + * self.config.num_envs_per_env_runner + ) else: - self._ts_dropped += factor * sum(len(e) for e in episodes) + dropped = sum(len(e) for e in episodes) + + self._ts_dropped += factor * dropped + # NOTE: There is a strong assumption here that the requests launched to # learner workers will return at the same time, since they have a # barrier inside for gradient aggregation. Therefore, results should be # a list of lists where each inner list should be the length of the # number of learner workers, if results from an non-blocking update are # ready. - results = self._get_async_results(results) + results = self._get_async_results(tags_to_get) else: results = self._get_results( self._worker_manager.foreach_actor(partials) ) - # If we are on the old or hybrid API stacks (no EnvRunners), we need to emulate + # If we are on the hybrid API stacks (no EnvRunners), we need to emulate # the old behavior of returning an already reduced dict (as if we had a # reduce_fn). if not self.config.enable_env_runner_and_connector_v2: @@ -508,7 +586,7 @@ def _get_results(self, results): raise result_or_error return processed_results - def _get_async_results(self, results): + def _get_async_results(self, tags_to_get): # results): """Get results from the worker manager and group them by tag. Returns: @@ -516,33 +594,37 @@ def _get_async_results(self, results): for same tags. """ - if results is None: - return [] + # if results is None: + # return [] unprocessed_results = defaultdict(list) - for result in results: - result_or_error = result.get() - if result.ok: - tag = result.tag - if not tag: - raise RuntimeError( - "Cannot call `LearnerGroup._get_async_results()` on untagged " - "async requests!" 
- ) - unprocessed_results[tag].append(result_or_error) + for tag in tags_to_get: + results = self._update_request_results[tag] + for result in results: + result_or_error = result.get() + if result.ok: + if result.tag is None: + raise RuntimeError( + "Cannot call `LearnerGroup._get_async_results()` on " + "untagged async requests!" + ) + tag = int(result.tag) + unprocessed_results[tag].append(result_or_error) + + if tag in self._update_request_tags: + self._update_request_tags[tag] -= 1 + if self._update_request_tags[tag] == 0: + del self._update_request_tags[tag] + del self._update_request_results[tag] + else: + assert False + assert tag in self._additional_update_request_tags + self._additional_update_request_tags[tag] -= 1 + if self._additional_update_request_tags[tag] == 0: + del self._additional_update_request_tags[tag] - if tag in self._update_request_tags: - self._update_request_tags[tag] -= 1 - if self._update_request_tags[tag] == 0: - del self._update_request_tags[tag] else: - assert tag in self._additional_update_request_tags - self._additional_update_request_tags[tag] -= 1 - if self._additional_update_request_tags[tag] == 0: - del self._additional_update_request_tags[tag] - - else: - raise result_or_error + raise result_or_error return list(unprocessed_results.values()) @@ -688,22 +770,50 @@ def set_weights(self, weights: Dict[str, Any]) -> None: # raise errors if any self._get_results(results_or_errors) - def get_state(self) -> Dict[str, Any]: + def get_state( + self, + components: Optional[Container[str]] = None, + *, + inference_only: bool = False, + module_ids: Container[ModuleID] = None, + ) -> Dict[str, Any]: """Get the states of this LearnerGroup. Contains the Learners' state (which should be the same across Learners) and some other information. + Args: + components: An optional list of string keys to be included in the + returned state. This might be useful, if getting certain components + of the state is expensive (e.g. reading/compiling the weights of a large + NN) and at the same time, these components are not required by the + caller. + inference_only: Return weights with workers that keep inference-only + modules. This is needed for algorithms in the new stack that + use inference-only modules. In this case only a part of the + parameters are synced to the workers. Default is False. + module_ids: Optional container of ModuleIDs to be returned only within the + state dict. If None (default), all module IDs' weights are returned. + Returns: The state dict mapping str keys to state information. 
""" if self.is_local: - learner_state = self._learner.get_state() + learner_state = self._learner.get_state( + components=components, + inference_only=inference_only, + module_ids=module_ids, + ) else: worker = self._worker_manager.healthy_actor_ids()[0] assert len(self._workers) == self._worker_manager.num_healthy_actors() results = self._worker_manager.foreach_actor( - lambda w: w.get_state(), remote_actor_ids=[worker] + lambda w: w.get_state( + components=components, + inference_only=inference_only, + module_ids=module_ids, + ), + remote_actor_ids=[worker], ) learner_state = self._get_results(results)[0] @@ -782,8 +892,8 @@ def save_state(self, path: str) -> None: worker_ip_addr, worker_temp_dir, self_ip_addr, path ) - # creating this function here instead of making it a member funciton - # becasue it uses the worker_temp_dir variable, and this can't + # Creating this function here instead of making it a member function + # because it uses the worker_temp_dir variable, and this can't # be passed in as an argument to foreach_actor def remove_dir(w): import shutil diff --git a/rllib/core/learner/torch/torch_learner.py b/rllib/core/learner/torch/torch_learner.py index 6dc586c12077..f06cef849d3b 100644 --- a/rllib/core/learner/torch/torch_learner.py +++ b/rllib/core/learner/torch/torch_learner.py @@ -32,7 +32,11 @@ ) from ray.rllib.utils.deprecation import deprecation_warning from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.metrics import ALL_MODULES +from ray.rllib.utils.metrics import ( + ALL_MODULES, + NUM_TRAINABLE_PARAMETERS, + NUM_NON_TRAINABLE_PARAMETERS, +) from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.torch_utils import ( convert_to_torch_tensor, @@ -123,13 +127,21 @@ def _uncompiled_update( **kwargs, ): """Performs a single update given a batch of data.""" + # Activate tensor-mode on our MetricsLogger. + self.metrics.activate_tensor_mode() + fwd_out = self.module.forward_train(batch) loss_per_module = self.compute_loss(fwd_out=fwd_out, batch=batch) gradients = self.compute_gradients(loss_per_module) postprocessed_gradients = self.postprocess_gradients(gradients) self.apply_gradients(postprocessed_gradients) - return fwd_out, loss_per_module, self.metrics.deactivate_tensor_mode() + + # Deactivate tensor-mode on our MetricsLogger and collect the (tensor) + # results. + collected_tensor_metrics = self.metrics.deactivate_tensor_mode() + + return fwd_out, loss_per_module, collected_tensor_metrics @override(Learner) def compute_gradients( @@ -353,11 +365,38 @@ def build(self) -> None: self._make_modules_ddp_if_necessary() + # Log number of non-trainable and trainable parameters of our RLModule. 
+ num_trainable_params = { + (mid, NUM_TRAINABLE_PARAMETERS): sum( + p.numel() for p in rlm.parameters() if p.requires_grad + ) + for mid, rlm in self.module._rl_modules.items() + if isinstance(rlm, TorchRLModule) + } + num_non_trainable_params = { + (mid, NUM_NON_TRAINABLE_PARAMETERS): sum( + p.numel() for p in rlm.parameters() if not p.requires_grad + ) + for mid, rlm in self.module._rl_modules.items() + if isinstance(rlm, TorchRLModule) + } + self.metrics.log_dict( + { + **{ + (ALL_MODULES, NUM_TRAINABLE_PARAMETERS): sum( + num_trainable_params.values() + ), + (ALL_MODULES, NUM_NON_TRAINABLE_PARAMETERS): sum( + num_non_trainable_params.values() + ), + }, + **num_trainable_params, + **num_non_trainable_params, + } + ) + @override(Learner) def _update(self, batch: NestedDict) -> Tuple[Any, Any, Any]: - # Activate tensor-mode on our MetricsLogger. - self.metrics.activate_tensor_mode() - # The first time we call _update after building the learner or # adding/removing models, we update with the uncompiled update method. # This makes it so that any variables that may be created during the first diff --git a/rllib/core/rl_module/marl_module.py b/rllib/core/rl_module/marl_module.py index e068c68ddfa7..177b2560fe1d 100644 --- a/rllib/core/rl_module/marl_module.py +++ b/rllib/core/rl_module/marl_module.py @@ -1,4 +1,5 @@ from dataclasses import dataclass, field +import logging import pathlib import pprint from typing import ( @@ -7,7 +8,6 @@ Dict, KeysView, List, - Mapping, Optional, Set, Type, @@ -32,8 +32,11 @@ from ray.rllib.utils.policy import validate_policy_id from ray.rllib.utils.serialization import serialize_type, deserialize_type from ray.rllib.utils.typing import ModuleID, T +from ray.util import log_once from ray.util.annotations import PublicAPI +logger = logging.getLogger("ray.rllib") + @PublicAPI(stability="alpha") class MultiAgentRLModule(RLModule): @@ -239,7 +242,7 @@ def _default_input_specs(self) -> SpecType: @override(RLModule) def _forward_train( self, batch: MultiAgentBatch, **kwargs - ) -> Union[Mapping[str, Any], Dict[ModuleID, Mapping[str, Any]]]: + ) -> Union[Dict[str, Any], Dict[ModuleID, Dict[str, Any]]]: """Runs the forward_train pass. TODO(avnishn, kourosh): Review type hints for forward methods. @@ -256,7 +259,7 @@ def _forward_train( @override(RLModule) def _forward_inference( self, batch: MultiAgentBatch, **kwargs - ) -> Union[Mapping[str, Any], Dict[ModuleID, Mapping[str, Any]]]: + ) -> Union[Dict[str, Any], Dict[ModuleID, Dict[str, Any]]]: """Runs the forward_inference pass. TODO(avnishn, kourosh): Review type hints for forward methods. @@ -273,7 +276,7 @@ def _forward_inference( @override(RLModule) def _forward_exploration( self, batch: MultiAgentBatch, **kwargs - ) -> Union[Mapping[str, Any], Dict[ModuleID, Mapping[str, Any]]]: + ) -> Union[Dict[str, Any], Dict[ModuleID, Dict[str, Any]]]: """Runs the forward_exploration pass. TODO(avnishn, kourosh): Review type hints for forward methods. @@ -290,7 +293,7 @@ def _forward_exploration( @override(RLModule) def get_state( self, module_ids: Optional[Set[ModuleID]] = None, inference_only: bool = False - ) -> Mapping[ModuleID, Any]: + ) -> Dict[ModuleID, Any]: """Returns the state of the multi-agent module. This method returns the state of each module specified by module_ids. If @@ -317,7 +320,7 @@ def get_state( } @override(RLModule) - def set_state(self, state_dict: Mapping[ModuleID, Any]) -> None: + def set_state(self, state_dict: Dict[ModuleID, Any]) -> None: """Sets the state of the multi-agent module. 
It is assumed that the state_dict is a mapping from module IDs to their @@ -331,7 +334,12 @@ def set_state(self, state_dict: Mapping[ModuleID, Any]) -> None: state_dict: The state dict to set. """ for module_id, state in state_dict.items(): - self._rl_modules[module_id].set_state(state) + if module_id in self: + self._rl_modules[module_id].set_state(state) + elif log_once("mid_in_state_but_not_in_marl_module"): + logger.warning( + f"ModuleID '{module_id}' found in `state`, but not in `self`!" + ) @override(RLModule) def save_state(self, path: Union[str, pathlib.Path]) -> None: @@ -413,7 +421,7 @@ def _run_forward_pass( forward_fn_name: str, batch: Union[NestedDict[Any], Dict[ModuleID, Any]], **kwargs, - ) -> Dict[ModuleID, Mapping[ModuleID, Any]]: + ) -> Dict[ModuleID, Dict[ModuleID, Any]]: """This is a helper method that runs the forward pass for the given module. It uses forward_fn_name to get the forward pass method from the RLModule @@ -638,7 +646,7 @@ def as_multi_agent(self) -> "MultiAgentRLModuleSpec": @ExperimentalAPI @dataclass class MultiAgentRLModuleConfig: - modules: Mapping[ModuleID, SingleAgentRLModuleSpec] = field(default_factory=dict) + modules: Dict[ModuleID, SingleAgentRLModuleSpec] = field(default_factory=dict) def to_dict(self): return { diff --git a/rllib/env/env_runner.py b/rllib/env/env_runner.py index a142128c56ae..dc0a95ab22d5 100644 --- a/rllib/env/env_runner.py +++ b/rllib/env/env_runner.py @@ -1,14 +1,18 @@ import abc -from typing import Any, Dict, TYPE_CHECKING +from typing import Any, Container, Dict, Optional, TYPE_CHECKING + +import tree # pip install dm_tree from ray.rllib.utils.actor_manager import FaultAwareApply from ray.rllib.utils.annotations import OldAPIStack from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.torch_utils import convert_to_torch_tensor +from ray.rllib.utils.typing import TensorType if TYPE_CHECKING: from ray.rllib.algorithms.algorithm_config import AlgorithmConfig -tf1, _, _ = try_import_tf() +tf1, tf, _ = try_import_tf() @OldAPIStack @@ -73,16 +77,30 @@ def sample(self, **kwargs) -> Any: The collected experience in any form. """ - def get_state(self) -> Dict[str, Any]: + def get_state( + self, + components: Optional[Container[str]] = None, + inference_only: bool = False, + ) -> Dict[str, Any]: """Returns this EnvRunner's (possibly serialized) current state as a dict. + Args: + components: An optional list of string keys to be included in the + returned state. This might be useful, if getting certain components + of the state is expensive (e.g. reading/compiling the weights of a large + NN) and at the same time, these components are not required by the + caller. + inference_only: Whether to return the inference-only weight set of the + underlying RLModule. Note that this setting only has an effect if + components is None or the string "rl_module" is in components. + Returns: - The current state of this EnvRunner. + The current state (or only the components specified) of this EnvRunner. """ # TODO (sven, simon): `Algorithm.save_checkpoint()` will store with - # this an empty worker state and in `Algorithm.from_checkpoint()` - # the empty state (not `None`) must be ensured separately. Shall we - # return here as a default `None`? + # this an empty worker state and in `Algorithm.from_checkpoint()` + # the empty state (not `None`) must be ensured separately. Shall we + # return here as a default `None`? 
return {} def set_state(self, state: Dict[str, Any]) -> None: @@ -113,3 +131,11 @@ def stop(self) -> None: def __del__(self) -> None: """If this Actor is deleted, clears all resources used by it.""" pass + + def _convert_to_tensor(self, struct) -> TensorType: + """Converts structs to a framework-specific tensor.""" + + if self.config.framework_str == "torch": + return convert_to_torch_tensor(struct) + else: + return tree.map_structure(tf.convert_to_tensor, struct) diff --git a/rllib/env/env_runner_group.py b/rllib/env/env_runner_group.py index 1808588e05af..86c0c24f7ea0 100644 --- a/rllib/env/env_runner_group.py +++ b/rllib/env/env_runner_group.py @@ -20,6 +20,7 @@ import ray from ray.actor import ActorHandle from ray.exceptions import RayActorError +from ray.rllib.core.learner import LearnerGroup from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.utils.actor_manager import RemoteCallResults @@ -52,7 +53,6 @@ if TYPE_CHECKING: from ray.rllib.algorithms.algorithm_config import AlgorithmConfig - from ray.rllib.core.learner import LearnerGroup tf1, tf, tfv = try_import_tf() @@ -385,9 +385,13 @@ def num_remote_worker_restarts(self) -> int: @DeveloperAPI def sync_env_runner_states( self, + *, config: "AlgorithmConfig", from_worker: Optional[EnvRunner] = None, env_steps_sampled: Optional[int] = None, + connector_states: Optional[List[Dict[str, Any]]] = None, + rl_module_state: Optional[Dict[str, Any]] = None, + env_runner_indices_to_update: Optional[List[int]] = None, ) -> None: """Synchronizes the connectors of this EnvRunnerGroup's EnvRunners. @@ -407,6 +411,9 @@ def sync_env_runner_states( env_steps_sampled: The total number of env steps taken thus far by all workers combined. Used to broadcast this number to all remote workers if `update_worker_filter_stats` is True in `config`. + env_runner_indices_to_update: The indices of those EnvRunners to update + with the merged state. Use None (default) to update all remote + EnvRunners. """ local_worker = self.local_worker() from_worker = from_worker or local_worker @@ -415,10 +422,20 @@ def sync_env_runner_states( # local worker is the only operating worker and thus of course always holds # the reference connector state. if self.num_healthy_remote_workers() == 0: - if env_steps_sampled: - self.local_worker().metrics.set_value( - NUM_ENV_STEPS_SAMPLED_LIFETIME, env_steps_sampled - ) + self.local_worker().set_state( + { + **( + {NUM_ENV_STEPS_SAMPLED_LIFETIME: env_steps_sampled} + if env_steps_sampled is not None + else {} + ), + **( + {"rl_module": rl_module_state} + if rl_module_state is not None + else {} + ), + } + ) return # Also early out, if we a) don't use the remote states AND b) don't want to @@ -427,73 +444,79 @@ def sync_env_runner_states( if not config.update_worker_filter_stats and not config.use_worker_filter_stats: return - env_runner_states = {} # Use states from all remote EnvRunners. 
if config.use_worker_filter_stats: - connector_states = self.foreach_worker( - lambda w: (w._env_to_module.get_state(), w._module_to_env.get_state()), - local_worker=False, - timeout_seconds=config.sync_filters_on_rollout_workers_timeout_s, - ) - env_to_module_states = [s[0] for s in connector_states] - module_to_env_states = [s[1] for s in connector_states] + if connector_states == []: + env_runner_states = {} + else: + if connector_states is None: + connector_states = self.foreach_worker( + lambda w: w.get_state( + components=[ + "env_to_module_connector", + "module_to_env_connector", + ] + ), + local_worker=False, + timeout_seconds=( + config.sync_filters_on_rollout_workers_timeout_s + ), + ) + env_to_module_states = [ + s["env_to_module_connector"] for s in connector_states + ] + module_to_env_states = [ + s["module_to_env_connector"] for s in connector_states + ] - env_runner_states["connector_states"] = { - "env_to_module_states": local_worker._env_to_module.merge_states( - env_to_module_states - ), - "module_to_env_states": local_worker._module_to_env.merge_states( - module_to_env_states - ), - } + env_runner_states = { + "env_to_module_connector": local_worker._env_to_module.merge_states( + env_to_module_states + ), + "module_to_env_connector": local_worker._module_to_env.merge_states( + module_to_env_states + ), + } # Ignore states from remote EnvRunners (use the current `from_worker` states # only). - elif hasattr(from_worker, "_env_to_module"): - env_runner_states["connector_states"] = { - "env_to_module_states": from_worker._env_to_module.get_state(), - "module_to_env_states": from_worker._module_to_env.get_state(), - } + else: + env_runner_states = from_worker.get_state( + components=["env_to_module_connector", "module_to_env_connector"] + ) # Update the global number of environment steps, if necessary. - if env_steps_sampled: - env_runner_states["env_steps_sampled"] = env_steps_sampled - - # Put the state dictionary into Ray's object store to avoid having to make n - # pickled copies of the state dict. - ref_env_runner_states = ray.put(env_runner_states) - - def _update(_env_runner: EnvRunner) -> Any: - env_runner_states = ray.get(ref_env_runner_states) - if hasattr(_env_runner, "_env_to_module"): - _env_runner._env_to_module.set_state( - env_runner_states["connector_states"]["env_to_module_states"] - ) - _env_runner._module_to_env.set_state( - env_runner_states["connector_states"]["module_to_env_states"] - ) - # Update the global number of environment steps for each worker. - if "env_steps_sampled" in env_runner_states: - # _env_runner.global_num_env_steps_sampled = - _env_runner.metrics.set_value( - NUM_ENV_STEPS_SAMPLED_LIFETIME, - env_runner_states["env_steps_sampled"], - ) - - # Broadcast updated states back to all workers (including the local one). - if config.update_worker_filter_stats: + if env_steps_sampled is not None: + env_runner_states[NUM_ENV_STEPS_SAMPLED_LIFETIME] = env_steps_sampled + + # Update the rl_module component of the EnvRunner states, if necessary: + if rl_module_state: + env_runner_states["rl_module"] = rl_module_state + + # If we do NOT want remote EnvRunners to get their Connector states updated, + # only update the local worker here (with all state components) and then remove + # the connector components. 
+ if not config.update_worker_filter_stats: + local_worker.set_state(env_runner_states) + del env_runner_states["env_to_module_connector"] + del env_runner_states["module_to_env_connector"] + + # If there are components in the state left -> Update remote workers with these + # state components (and maybe the local worker, if it hasn't been updated yet). + if env_runner_states: + # Put the state dictionary into Ray's object store to avoid having to make n + # pickled copies of the state dict. + ref_env_runner_states = ray.put(env_runner_states) + + def _update(_env_runner: EnvRunner) -> None: + _env_runner.set_state(ray.get(ref_env_runner_states)) + + # Broadcast updated states back to all workers. self.foreach_worker( _update, - local_worker=True, - timeout_seconds=config.sync_filters_on_rollout_workers_timeout_s, + remote_worker_ids=env_runner_indices_to_update, + local_worker=config.update_worker_filter_stats, + timeout_seconds=0.0, # This is a state update -> Fire-and-forget. ) - # Update only the local_worker. Why don't we use `from_worker` here (assuming - # it's different from the local worker)? B/c we want to use this utility as - # a means to update the local worker of EnvRunnerGroup A from another - # EnvRunnerGroup B (for example synching eval EnvRunners from training - # EnvRunners). In other words, if `from_worker` != local worker, - # `from_worker`'s state will not be altered by this method, no matter what. - else: - _update(self.local_worker()) @DeveloperAPI def sync_weights( @@ -521,9 +544,10 @@ def sync_weights( global_vars: An optional global vars dict to set this worker to. If None, do not update the global_vars. timeout_seconds: Timeout in seconds to wait for the sync weights - calls to complete. Default is 0 (sync-and-forget, do not wait - for any sync calls to finish). This significantly improves - algorithm performance. + calls to complete. Default is 0.0 (fire-and-forget, do not wait + for any sync calls to finish). Setting this to 0.0 might significantly + improve algorithm performance, depending on the algo's `training_step` + logic. inference_only: Synch weights with workers that keep inference-only modules. This is needed for algorithms in the new stack that use inference-only modules. In this case only a part of the @@ -545,14 +569,34 @@ def sync_weights( "`from_worker_or_trainer` is None. In this case, EnvRunnerGroup " "should have local_env_runner. But local_env_runner is also None." ) - weights = weights_src.get_weights(policies, inference_only) + if self._remote_config.enable_env_runner_and_connector_v2: + weights = weights_src.get_state( + components="rl_module", + inference_only=inference_only, + module_ids=policies, + ) + if isinstance(weights_src, LearnerGroup): + weights = weights["learner_state"]["rl_module"] + else: + weights = weights["rl_module"] + else: + weights = weights_src.get_weights(policies, inference_only) + # Move weights to the object store to avoid having to make n pickled copies # of the weights dict for each worker. weights_ref = ray.put(weights) - def _set_weights(env_runner): - _weights = ray.get(weights_ref) - env_runner.set_weights(_weights, global_vars) + if self._remote_config.enable_env_runner_and_connector_v2: + + def _set_weights(env_runner): + _weights = ray.get(weights_ref) + env_runner.set_state({"rl_module": _weights}) + + else: + + def _set_weights(env_runner): + _weights = ray.get(weights_ref) + env_runner.set_weights(_weights, global_vars) # Sync to specified remote workers in this EnvRunnerGroup. 
        self.foreach_worker(
@@ -811,8 +855,11 @@ def foreach_worker(
             func: The function to call for each worker (as only arg).
             local_worker: Whether to apply `func` on local worker too. Default is True.
             healthy_only: Apply `func` on known-to-be healthy workers only.
-            remote_worker_ids: Apply `func` on a selected set of remote workers.
-            timeout_seconds: Time to wait for results. Default is None.
+            remote_worker_ids: Apply `func` on a selected set of remote workers. Use
+                None (default) for all remote EnvRunners.
+            timeout_seconds: Time to wait (in seconds) for results. Set this to 0.0 for
+                fire-and-forget. Set this to None (default) to wait infinitely (i.e. for
+                synchronous execution).
             return_obj_refs: Whether to return ObjectRef instead of actual results.
                 Note: For fault tolerance reasons, these returned ObjectRefs should
                 never be resolved with ray.get() outside of this WorkerSet.
@@ -862,8 +909,7 @@ def foreach_worker_with_id(
         func: Callable[[int, EnvRunner], T],
         *,
         local_worker: bool = True,
-        # TODO(jungong) : switch to True once Algorithm is migrated.
-        healthy_only: bool = False,
+        healthy_only: bool = True,
         remote_worker_ids: List[int] = None,
         timeout_seconds: Optional[float] = None,
     ) -> List[T]:
@@ -908,8 +954,7 @@ def foreach_worker_async(
         self,
         func: Callable[[EnvRunner], T],
         *,
-        # TODO(jungong) : switch to True once Algorithm is migrated.
-        healthy_only: bool = False,
+        healthy_only: bool = True,
         remote_worker_ids: List[int] = None,
     ) -> int:
         """Calls the given function asynchronously with each worker as the argument.
@@ -924,7 +969,11 @@ def foreach_worker_async(
             remote_worker_ids: Apply `func` on a selected set of remote workers.

         Returns:
-            The number of async requests that are currently in-flight.
+            The number of async requests that have actually been made. This is the
+                length of `remote_worker_ids` (or `self.num_remote_workers()` if
+                `remote_worker_ids` is None) minus the number of requests that were NOT
+                made because a remote worker already had its
+                `max_remote_requests_in_flight_per_actor` counter reached.
""" return self._worker_manager.foreach_actor_async( func, diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py index 1300c8fd34b9..98d08bf6c603 100644 --- a/rllib/env/multi_agent_env_runner.py +++ b/rllib/env/multi_agent_env_runner.py @@ -1,20 +1,22 @@ -import gymnasium as gym -import logging - from collections import defaultdict from functools import partial -from typing import DefaultDict, Dict, List, Optional +import logging +from typing import Any, Container, DefaultDict, Dict, List, Optional + +import gymnasium as gym from ray.rllib.algorithms.algorithm_config import AlgorithmConfig from ray.rllib.algorithms.callbacks import DefaultCallbacks from ray.rllib.core.columns import Columns -from ray.rllib.core.rl_module.marl_module import ModuleID, MultiAgentRLModuleSpec +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.env.env_context import EnvContext from ray.rllib.env.env_runner import EnvRunner from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.env.multi_agent_episode import MultiAgentEpisode from ray.rllib.env.utils import _gym_env_creator +from ray.rllib.utils import force_list from ray.rllib.utils.annotations import override +from ray.rllib.utils.deprecation import Deprecated from ray.rllib.utils.metrics import ( EPISODE_DURATION_SEC_MEAN, EPISODE_LEN_MAX, @@ -30,6 +32,7 @@ NUM_EPISODES, NUM_MODULE_STEPS_SAMPLED, NUM_MODULE_STEPS_SAMPLED_LIFETIME, + WEIGHTS_SEQ_NO, ) from ray.rllib.utils.metrics.metrics_logger import MetricsLogger from ray.rllib.utils.pre_checks.env import check_multiagent_environments @@ -583,6 +586,10 @@ def get_metrics(self) -> ResultDict: for eps in self._done_episodes_for_metrics: assert eps.is_done episode_length = len(eps) + agent_steps = defaultdict( + int, + {str(aid): len(sa_eps) for aid, sa_eps in eps.agent_episodes.items()}, + ) episode_return = eps.get_return() episode_duration_s = eps.get_duration_s() @@ -608,10 +615,13 @@ def get_metrics(self) -> ResultDict: episode_length += len(eps2) episode_return += return_eps2 episode_duration_s += eps2.get_duration_s() + for sa_eps in eps2.agent_episodes.values(): return_sa = sa_eps.get_return() + agent_steps[str(sa_eps.agent_id)] += len(sa_eps) agent_episode_returns[str(sa_eps.agent_id)] += return_sa module_episode_returns[sa_eps.module_id] += return_sa + del self._ongoing_episodes_for_metrics[eps.id_] self._log_episode_metrics( @@ -620,6 +630,7 @@ def get_metrics(self) -> ResultDict: episode_duration_s, agent_episode_returns, module_episode_returns, + dict(agent_steps), ) # Log num episodes counter for this iteration. @@ -636,58 +647,67 @@ def get_metrics(self) -> ResultDict: # Return reduced metrics. return self.metrics.reduce() - # TODO (sven): Remove the requirement for EnvRunners/RolloutWorkers to have this - # API. Replace by proper state overriding via `EnvRunner.set_state()` - def set_weights( + @override(EnvRunner) + def get_state( self, - weights: Dict[ModuleID, ModelWeights], - global_vars: Optional[Dict] = None, - weights_seq_no: int = 0, - ) -> None: - """Writes the weights of our multi-agent `RLModule` - - Args: - weigths: A dictionary mapping `ModuleID`s to the new weigths to - be used in the `MultiAgentRLModule` stored in this instance. - global_vars: An optional global vars dictionary to set this - worker to. If None, do not update the global_vars. - weights_seq_no: If needed, a sequence number for the weights version - can be passed into this method. 
If not None, will store this seq no - (in self.weights_seq_no) and in future calls - if the seq no did not - change wrt. the last call - will ignore the call to save on performance. - - .. testcode:: - :skipif: True - - from ray.rllib.env import MultiAgentEnvRunner - # Create an `MultiAgentEnvRunner`. - worker = ... - weights = worker.get_weights() - # Set `global_vars` (timestep) as well. - worker.set_weights(weights, {"timestep": 42}) - """ - # Only update the weigths, if this is the first synchronization or - # if the weights of this `EnvRunner` lacks behind the actual ones. - if weights_seq_no == 0 or self._weights_seq_no < weights_seq_no: - self.module.set_state(weights) - - def get_weights( - self, modules=None, inference_only: bool = False - ) -> Dict[ModuleID, ModelWeights]: - """Returns the weights of our multi-agent `RLModule`. - - Args: - modules: `ModuleID`s for which to return the weights. If `None` - weigths for all modules are returned. See for details - `MultiAgentRLModule.get_state()`. - inference_only: If True, will return only a specified subset of the - weights (e.g. only the weights needed for inference). + components: Optional[Container[str]] = None, + *, + inference_only: bool = True, + module_ids=None, + ) -> Dict[str, Any]: + components = force_list( + components + if components is not None + else ["rl_module", "env_to_module_connector", "module_to_env_connector"] + ) + state = { + WEIGHTS_SEQ_NO: self._weights_seq_no, + NUM_ENV_STEPS_SAMPLED_LIFETIME: ( + self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0) + ), + } + if "rl_module" in components: + state["rl_module"] = self.module.get_state( + inference_only=inference_only, module_ids=module_ids + ) + if "env_to_module_connector" in components: + state["env_to_module_connector"] = self._env_to_module.get_state() + if "module_to_env_connector" in components: + state["module_to_env_connector"] = self._module_to_env.get_state() - Returns: - A dictionary mapping `ModuleID`s to their corresponding weights. - """ + return state - return self.module.get_state(module_ids=modules, inference_only=inference_only) + @override(EnvRunner) + def set_state(self, state: Dict[str, Any]) -> None: + if "env_to_module_connector" in state: + self._env_to_module.set_state(state["env_to_module_connector"]) + if "module_to_env_connector" in state: + self._module_to_env.set_state(state["module_to_env_connector"]) + + # Update the RLModule state. + if "rl_module" in state: + # A missing value for WEIGHTS_SEQ_NO or a value of 0 means: Force the + # update. + weights_seq_no = state.get(WEIGHTS_SEQ_NO, 0) + + # Only update the weigths, if this is the first synchronization or + # if the weights of this `EnvRunner` lacks behind the actual ones. + if weights_seq_no == 0 or self._weights_seq_no < weights_seq_no: + weights = state["rl_module"] + weights = self._convert_to_tensor(weights) + self.module.set_state(weights) + + # Update our weights_seq_no, if the new one is > 0. + if weights_seq_no > 0: + self._weights_seq_no = weights_seq_no + + # Update our lifetime counters. 
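+        # (This keeps a restored or freshly synced EnvRunner's lifetime env-step
+        # count consistent with the state it was handed.)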
+ if NUM_ENV_STEPS_SAMPLED_LIFETIME in state: + self.metrics.set_value( + key=NUM_ENV_STEPS_SAMPLED_LIFETIME, + value=state[NUM_ENV_STEPS_SAMPLED_LIFETIME], + reduce="sum", + ) @override(EnvRunner) def assert_healthy(self): @@ -865,7 +885,15 @@ def _increase_sampled_metrics(self, num_steps, next_obs, episode): ) return num_steps - def _log_episode_metrics(self, length, ret, sec, agents=None, modules=None): + def _log_episode_metrics( + self, + length, + ret, + sec, + agents=None, + modules=None, + agent_steps=None, + ): # Log general episode metrics. self.metrics.log_dict( { @@ -878,6 +906,7 @@ def _log_episode_metrics(self, length, ret, sec, agents=None, modules=None): "agent_episode_returns_mean": agents, # Per-RLModule returns. "module_episode_returns_mean": modules, + "agent_steps": agent_steps, } if agents is not None else {} @@ -904,3 +933,25 @@ def _log_episode_metrics(self, length, ret, sec, agents=None, modules=None): reduce="max", window=self.config.metrics_num_episodes_for_smoothing, ) + + @Deprecated( + new="MultiAgentEnvRunner.get_state(components='rl_module')", + error=False, + ) + def get_weights(self, modules=None): + return self.get_state(components="rl_module")["rl_module"] + + @Deprecated(new="MultiAgentEnvRunner.set_state()", error=False) + def set_weights( + self, + weights: ModelWeights, + global_vars: Optional[Dict] = None, + weights_seq_no: int = 0, + ) -> None: + assert global_vars is None + return self.set_state( + { + "rl_module": weights, + WEIGHTS_SEQ_NO: weights_seq_no, + } + ) diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py index 582806dd193b..773f8ea16675 100644 --- a/rllib/env/single_agent_env_runner.py +++ b/rllib/env/single_agent_env_runner.py @@ -1,10 +1,10 @@ -import gymnasium as gym -import logging -import tree - +import time from collections import defaultdict from functools import partial -from typing import DefaultDict, Dict, List, Optional +import logging +from typing import Any, Container, DefaultDict, Dict, List, Optional + +import gymnasium as gym from ray.rllib.algorithms.algorithm_config import AlgorithmConfig from ray.rllib.algorithms.callbacks import DefaultCallbacks @@ -16,7 +16,9 @@ from ray.rllib.env.env_runner import EnvRunner from ray.rllib.env.single_agent_episode import SingleAgentEpisode from ray.rllib.env.utils import _gym_env_creator +from ray.rllib.utils import force_list from ray.rllib.utils.annotations import override +from ray.rllib.utils.deprecation import Deprecated from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.metrics import ( EPISODE_DURATION_SEC_MEAN, @@ -33,11 +35,13 @@ NUM_EPISODES, NUM_MODULE_STEPS_SAMPLED, NUM_MODULE_STEPS_SAMPLED_LIFETIME, + SAMPLE_TIMER, + TIME_BETWEEN_SAMPLING, + WEIGHTS_SEQ_NO, ) from ray.rllib.utils.metrics.metrics_logger import MetricsLogger from ray.rllib.utils.spaces.space_utils import unbatch -from ray.rllib.utils.torch_utils import convert_to_torch_tensor -from ray.rllib.utils.typing import EpisodeID, ModelWeights, ResultDict, TensorType +from ray.rllib.utils.typing import EpisodeID, ModelWeights, ResultDict from ray.tune.registry import ENV_CREATOR, _global_registry from ray.util.annotations import PublicAPI @@ -91,7 +95,7 @@ def __init__(self, config: AlgorithmConfig, **kwargs): try: module_spec: SingleAgentRLModuleSpec = self.config.rl_module_spec module_spec.observation_space = self._env_to_module.observation_space - module_spec.action_space = self.env.envs[0].action_space + module_spec.action_space = 
self.env.single_action_space if module_spec.model_config_dict is None: module_spec.model_config_dict = self.config.model_config # Only load a light version of the module, if available. This is useful @@ -120,6 +124,8 @@ def __init__(self, config: AlgorithmConfig, **kwargs): ] = defaultdict(list) self._weights_seq_no: int = 0 + self._time_after_sampling = None + @override(EnvRunner) def sample( self, @@ -156,55 +162,64 @@ def sample( """ assert not (num_timesteps is not None and num_episodes is not None) - # If no execution details are provided, use the config to try to infer the - # desired timesteps/episodes to sample and exploration behavior. - if explore is None: - explore = self.config.explore - if ( - num_timesteps is None - and num_episodes is None - and self.config.batch_mode == "truncate_episodes" - ): - num_timesteps = ( - self.config.get_rollout_fragment_length(worker_index=self.worker_index) - * self.num_envs + if self._time_after_sampling is not None: + self.metrics.log_value( + key=TIME_BETWEEN_SAMPLING, + value=time.perf_counter() - self._time_after_sampling, ) - # Sample n timesteps. - if num_timesteps is not None: - samples = self._sample_timesteps( - num_timesteps=num_timesteps, - explore=explore, - random_actions=random_actions, - force_reset=force_reset, - ) - # Sample m episodes. - elif num_episodes is not None: - samples = self._sample_episodes( - num_episodes=num_episodes, - explore=explore, - random_actions=random_actions, - ) - # For complete episodes mode, sample as long as the number of timesteps - # done is smaller than the `train_batch_size`. - else: - total = 0 - samples = [] - while total < self.config.train_batch_size: - episodes = self._sample_episodes( - num_episodes=self.num_envs, + with self.metrics.log_time(SAMPLE_TIMER): + # If no execution details are provided, use the config to try to infer the + # desired timesteps/episodes to sample and exploration behavior. + if explore is None: + explore = self.config.explore + if ( + num_timesteps is None + and num_episodes is None + and self.config.batch_mode == "truncate_episodes" + ): + num_timesteps = ( + self.config.get_rollout_fragment_length(self.worker_index) + * self.num_envs + ) + + # Sample n timesteps. + if num_timesteps is not None: + samples = self._sample_timesteps( + num_timesteps=num_timesteps, explore=explore, random_actions=random_actions, + force_reset=force_reset, ) - total += sum(len(e) for e in episodes) - samples.extend(episodes) + # Sample m episodes. + elif num_episodes is not None: + samples = self._sample_episodes( + num_episodes=num_episodes, + explore=explore, + random_actions=random_actions, + ) + # For complete episodes mode, sample as long as the number of timesteps + # done is smaller than the `train_batch_size`. + else: + total = 0 + samples = [] + while total < self.config.train_batch_size: + episodes = self._sample_episodes( + num_episodes=self.num_envs, + explore=explore, + random_actions=random_actions, + ) + total += sum(len(e) for e in episodes) + samples.extend(episodes) + + # Make the `on_sample_end` callback. + self._callbacks.on_sample_end( + env_runner=self, + metrics_logger=self.metrics, + samples=samples, + ) - # Make the `on_sample_end` callback. - self._callbacks.on_sample_end( - env_runner=self, - metrics_logger=self.metrics, - samples=samples, - ) + self._time_after_sampling = time.perf_counter() return samples @@ -278,7 +293,8 @@ def _sample_timesteps( # RLModule forward pass: Explore or not. 
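+                # (The lifetime env-step count computed below is passed to
+                # `forward_exploration()` as `t`, e.g. so a module can anneal its
+                # exploration behavior over time.)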
if explore: env_steps_lifetime = ( - self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) + ts + self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0) + + ts ) to_env = self.module.forward_exploration( to_module, t=env_steps_lifetime @@ -474,7 +490,8 @@ def _sample_episodes( # RLModule forward pass: Explore or not. if explore: env_steps_lifetime = ( - self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME) + ts + self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0) + + ts ) to_env = self.module.forward_exploration( to_module, t=env_steps_lifetime @@ -623,40 +640,66 @@ def get_metrics(self) -> ResultDict: # Return reduced metrics. return self.metrics.reduce() - # TODO (sven): Remove the requirement for EnvRunners/RolloutWorkers to have this - # API. Replace by proper state overriding via `EnvRunner.set_state()` - def set_weights( + @override(EnvRunner) + def get_state( self, - weights: ModelWeights, - global_vars: Optional[Dict] = None, - weights_seq_no: int = 0, - ) -> None: - """Writes the weights of our (single-agent) RLModule. - - Args: - weigths: A dictionary mapping `ModuleID`s to the new weigths to - be used in the `MultiAgentRLModule` stored in this instance. - global_vars: An optional global vars dictionary to set this - worker to. If None, do not update the global_vars. - weights_seq_no: If needed, a sequence number for the weights version - can be passed into this method. If not None, will store this seq no - (in self.weights_seq_no) and in future calls - if the seq no did not - change wrt. the last call - will ignore the call to save on performance. - - """ - - # Only update the weigths, if this is the first synchronization or - # if the weights of this `EnvRunner` lacks behind the actual ones. - if weights_seq_no == 0 or self._weights_seq_no < weights_seq_no: - if isinstance(weights, dict) and DEFAULT_MODULE_ID in weights: - weights = weights[DEFAULT_MODULE_ID] - weights = self._convert_to_tensor(weights) - self.module.set_state(weights) - - def get_weights(self, modules=None, inference_only: bool = False): - """Returns the weights of our (single-agent) RLModule.""" + components: Optional[Container[str]] = None, + *, + inference_only: bool = True, + module_ids=None, + ) -> Dict[str, Any]: + components = force_list( + components + if components is not None + else ["rl_module", "env_to_module_connector", "module_to_env_connector"] + ) + state = { + WEIGHTS_SEQ_NO: self._weights_seq_no, + NUM_ENV_STEPS_SAMPLED_LIFETIME: ( + self.metrics.peek(NUM_ENV_STEPS_SAMPLED_LIFETIME, default=0) + ), + } + if "rl_module" in components: + state["rl_module"] = self.module.get_state(inference_only=inference_only) + if "env_to_module_connector" in components: + state["env_to_module_connector"] = self._env_to_module.get_state() + if "module_to_env_connector" in components: + state["module_to_env_connector"] = self._module_to_env.get_state() + + return state - return self.module.get_state(inference_only=inference_only) + @override(EnvRunner) + def set_state(self, state: Dict[str, Any]) -> None: + if "env_to_module_connector" in state: + self._env_to_module.set_state(state["env_to_module_connector"]) + if "module_to_env_connector" in state: + self._module_to_env.set_state(state["module_to_env_connector"]) + + # Update the RLModule state. + if "rl_module" in state: + # A missing value for WEIGHTS_SEQ_NO or a value of 0 means: Force the + # update. 
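+            # Otherwise the weights are only applied if the incoming seq no is newer
+            # (strictly greater) than this EnvRunner's own `_weights_seq_no`: e.g. a
+            # runner already at seq no 3 ignores an update tagged 3, but applies one
+            # tagged 4 (or one tagged 0, which forces the update).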
+ weights_seq_no = state.get(WEIGHTS_SEQ_NO, 0) + + # Only update the weigths, if this is the first synchronization or + # if the weights of this `EnvRunner` lacks behind the actual ones. + if weights_seq_no == 0 or self._weights_seq_no < weights_seq_no: + weights = state["rl_module"] + if isinstance(weights, dict) and DEFAULT_MODULE_ID in weights: + weights = weights[DEFAULT_MODULE_ID] + weights = self._convert_to_tensor(weights) + self.module.set_state(weights) + # Update our weights_seq_no, if the new one is > 0. + if weights_seq_no > 0: + self._weights_seq_no = weights_seq_no + + # Update our lifetime counters. + if NUM_ENV_STEPS_SAMPLED_LIFETIME in state: + self.metrics.set_value( + key=NUM_ENV_STEPS_SAMPLED_LIFETIME, + value=state[NUM_ENV_STEPS_SAMPLED_LIFETIME], + reduce="sum", + ) @override(EnvRunner) def assert_healthy(self): @@ -760,14 +803,6 @@ def _make_on_episode_callback(self, which: str, idx: int, episodes=None): env_index=idx, ) - def _convert_to_tensor(self, struct) -> TensorType: - """Converts structs to a framework-specific tensor.""" - - if self.config.framework_str == "torch": - return convert_to_torch_tensor(struct) - else: - return tree.map_structure(tf.convert_to_tensor, struct) - def _increase_sampled_metrics(self, num_steps): # Per sample cycle stats. self.metrics.log_value( @@ -821,3 +856,25 @@ def _log_episode_metrics(self, length, ret, sec): self.metrics.log_value(EPISODE_RETURN_MIN, ret, reduce="min", window=win) self.metrics.log_value(EPISODE_LEN_MAX, length, reduce="max", window=win) self.metrics.log_value(EPISODE_RETURN_MAX, ret, reduce="max", window=win) + + @Deprecated( + new="SingleAgentEnvRunner.get_state(components='rl_module')", + error=False, + ) + def get_weights(self, modules=None): + return self.get_state(components="rl_module")["rl_module"] + + @Deprecated(new="SingleAgentEnvRunner.set_state()", error=False) + def set_weights( + self, + weights: ModelWeights, + global_vars: Optional[Dict] = None, + weights_seq_no: int = 0, + ) -> None: + assert global_vars is None + return self.set_state( + { + "rl_module": weights, + WEIGHTS_SEQ_NO: weights_seq_no, + } + ) diff --git a/rllib/env/tests/test_single_agent_env_runner.py b/rllib/env/tests/test_single_agent_env_runner.py index 83c7bf083c22..d6dbf7082985 100644 --- a/rllib/env/tests/test_single_agent_env_runner.py +++ b/rllib/env/tests/test_single_agent_env_runner.py @@ -1,8 +1,14 @@ +from functools import partial import unittest +import gymnasium as gym + import ray +from ray import tune from ray.rllib.algorithms.algorithm_config import AlgorithmConfig from ray.rllib.env.single_agent_env_runner import SingleAgentEnvRunner +from ray.rllib.env.utils import _gym_env_creator +from ray.rllib.examples.envs.classes.simple_corridor import SimpleCorridor class TestSingleAgentEnvRunner(unittest.TestCase): @@ -10,6 +16,20 @@ class TestSingleAgentEnvRunner(unittest.TestCase): def setUpClass(cls) -> None: ray.init() + tune.register_env( + "tune-registered", + lambda cfg: SimpleCorridor({"corridor_length": 10}), + ) + + gym.register( + "TestEnv-v0", + partial( + _gym_env_creator, + env_context={"corridor_length": 10}, + env_descriptor=SimpleCorridor, + ), + ) + @classmethod def tearDownClass(cls) -> None: ray.shutdown() @@ -51,40 +71,68 @@ def test_sample(self): # 2 (num_env_per_worker) * 64 (rollout_fragment_length). 
self.assertTrue(sum(len(e) for e in episodes) == 128) - def test_distributed_env_runner(self): - """Tests, whether SingleAgentGymEnvRunner can be distributed.""" - - remote_class = ray.remote(num_cpus=1, num_gpus=0)(SingleAgentEnvRunner) - - # Test with both parallelized sub-envs and w/o. - remote_worker_envs = [False, True] + def test_async_vector_env(self): + """Tests, whether SingleAgentGymEnvRunner can run with vector envs.""" - for envs_parallel in remote_worker_envs: + for env in ["TestEnv-v0", "CartPole-v1", SimpleCorridor, "tune-registered"]: config = ( - AlgorithmConfig().environment("CartPole-v1") - # Vectorize x2 and by default, rollout 64 timesteps per individual env. + AlgorithmConfig().environment(env) + # Vectorize x5 and by default, rollout 64 timesteps per individual env. .env_runners( - num_env_runners=5, + num_env_runners=0, num_envs_per_env_runner=5, rollout_fragment_length=10, - remote_worker_envs=envs_parallel, + remote_worker_envs=True, ) ) - array = [ - remote_class.remote(config=config) - for _ in range(config.num_env_runners) - ] - # Sample in parallel. - results = [a.sample.remote(random_actions=True) for a in array] - results = ray.get(results) - # Loop over individual EnvRunner Actor's results and inspect each. - for episodes in results: - # Assert length of all fragments is `rollout_fragment_length`. - self.assertEqual( - sum(len(e) for e in episodes), - config.num_envs_per_env_runner * config.rollout_fragment_length, + env_runner = SingleAgentEnvRunner(config=config) + + # Sample with the async-vectorized env. + episodes = env_runner.sample(random_actions=True) + # Assert length of all fragments is `rollout_fragment_length`. + self.assertEqual( + sum(len(e) for e in episodes), + config.num_envs_per_env_runner * config.rollout_fragment_length, + ) + env_runner.stop() + + def test_distributed_env_runner(self): + """Tests, whether SingleAgentGymEnvRunner can be distributed.""" + + remote_class = ray.remote(num_cpus=1, num_gpus=0)(SingleAgentEnvRunner) + + # Test with both parallelized sub-envs and w/o. + async_vectorization_mode = [False, True] + + for async_ in async_vectorization_mode: + + for env_spec in ["tune-registered", "CartPole-v1", SimpleCorridor]: + config = ( + AlgorithmConfig().environment(env_spec) + # Vectorize x5 and by default, rollout 64 timesteps per individual + # env. + .env_runners( + num_env_runners=5, + num_envs_per_env_runner=5, + rollout_fragment_length=10, + remote_worker_envs=async_, + ) ) + array = [ + remote_class.remote(config=config) + for _ in range(config.num_env_runners) + ] + # Sample in parallel. + results = [a.sample.remote(random_actions=True) for a in array] + results = ray.get(results) + # Loop over individual EnvRunner Actor's results and inspect each. + for episodes in results: + # Assert length of all fragments is `rollout_fragment_length`. 
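+                    # (Here: 5 sub-envs per EnvRunner x 10 timesteps each = 50
+                    # timesteps per returned sample.)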
+ self.assertEqual( + sum(len(e) for e in episodes), + config.num_envs_per_env_runner * config.rollout_fragment_length, + ) if __name__ == "__main__": diff --git a/rllib/env/utils/infinite_lookback_buffer.py b/rllib/env/utils/infinite_lookback_buffer.py index b1264305a3a6..998f680736e6 100644 --- a/rllib/env/utils/infinite_lookback_buffer.py +++ b/rllib/env/utils/infinite_lookback_buffer.py @@ -46,6 +46,9 @@ def append(self, item) -> None: def extend(self, items) -> None: """Appends all items in `items` to the end of this buffer.""" if self.finalized: + # TODO (sven): When extending with a list of structs, we should + # probably rather do: `tree.map_structure(..., self.data, + # tree.map_structure(lambda *s: np.array(*s), *items)`)?? self.data = tree.map_structure( lambda d, i: np.concatenate([d, i], axis=0), self.data, np.array(items) ) diff --git a/rllib/evaluation/postprocessing_v2.py b/rllib/evaluation/postprocessing_v2.py deleted file mode 100644 index 2ee6aed77c8c..000000000000 --- a/rllib/evaluation/postprocessing_v2.py +++ /dev/null @@ -1,38 +0,0 @@ -import numpy as np - -from ray.rllib.utils.annotations import OldAPIStack - - -@OldAPIStack -def compute_value_targets( - values, - rewards, - terminateds, - truncateds, - gamma: float, - lambda_: float, -): - """Computes value function (vf) targets given vf predictions and rewards. - - Note that advantages can then easily be computeed via the formula: - advantages = targets - vf_predictions - """ - # Force-set all values at terminals (not at truncations!) to 0.0. - orig_values = flat_values = values * (1.0 - terminateds) - - flat_values = np.append(flat_values, 0.0) - intermediates = rewards + gamma * (1 - lambda_) * flat_values[1:] - continues = 1.0 - terminateds - - Rs = [] - last = flat_values[-1] - for t in reversed(range(intermediates.shape[0])): - last = intermediates[t] + continues[t] * gamma * lambda_ * last - Rs.append(last) - if truncateds[t]: - last = orig_values[t] - - # Reverse back to correct (time) direction. 
- value_targets = np.stack(list(reversed(Rs)), axis=0) - - return value_targets.astype(np.float32) diff --git a/rllib/examples/connectors/flatten_observations_dict_space.py b/rllib/examples/connectors/flatten_observations_dict_space.py index bed31ce5ac28..6958c9c27cd2 100644 --- a/rllib/examples/connectors/flatten_observations_dict_space.py +++ b/rllib/examples/connectors/flatten_observations_dict_space.py @@ -64,12 +64,12 @@ |---------------------+------------+----------------+--------+------------------+ | PPO_env_a2fd6_00000 | TERMINATED | 127.0.0.1:7409 | 25 | 24.1426 | +---------------------+------------+----------------+--------+------------------+ -------------------------+------------------------+------------------------+ - num_env_steps_sample | num_env_steps_traine | episode_return_mean | - d_lifetime | d_lifetime | | -------------------------+------------------------+------------------------| - 100000 | 100000 | 421.42 | -------------------------+------------------------+------------------------+ ++------------------------+------------------------+------------------------+ +| num_env_steps_sample | num_env_steps_traine | episode_return_mean | +| d_lifetime | d_lifetime | | ++------------------------+------------------------+------------------------| +| 100000 | 100000 | 421.42 | ++------------------------+------------------------+------------------------+ """ from ray.tune.registry import register_env from ray.rllib.connectors.env_to_module import FlattenObservations @@ -113,7 +113,7 @@ def _env_to_module_pipeline(env): register_env("env", lambda _: CartPoleWithDictObservationSpace()) # Define the AlgorithmConfig used. - config = ( + base_config = ( get_trainable_cls(args.algo) .get_default_config() .environment("env") @@ -122,9 +122,7 @@ def _env_to_module_pipeline(env): gamma=0.99, lr=0.0003, ) - ) - if args.enable_new_api_stack: - config = config.rl_module( + .rl_module( model_config_dict={ "fcnet_hiddens": [32], "fcnet_activation": "linear", @@ -132,26 +130,34 @@ def _env_to_module_pipeline(env): "uses_new_env_runners": True, }, ) - else: - config = config.training( - model=dict( - fcnet_hiddens=[32], fcnet_activation="linear", vf_share_layers=True - ) - ) + ) # Add a simple multi-agent setup. if args.num_agents > 0: - config = config.multi_agent( + base_config.multi_agent( policies={f"p{i}" for i in range(args.num_agents)}, policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", ) - # Fix some PPO-specific settings. + # PPO-specific settings (for better learning behavior only). if args.algo == "PPO": - config = config.training( + base_config.training( num_sgd_iter=6, vf_loss_coeff=0.01, ) + # IMPALA-specific settings (for better learning behavior only). + elif args.algo == "IMPALA": + base_config.training( + lr=0.0005, + vf_loss_coeff=0.05, + entropy_coeff=0.0, + ) + base_config.rl_module( + model_config_dict={ + "vf_share_layers": True, + "uses_new_env_runners": True, + } + ) # Run everything as configured. 
- run_rllib_example_script_experiment(config, args) + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/connectors/frame_stacking.py b/rllib/examples/connectors/frame_stacking.py index e26918796ff4..52a4f4c352b1 100644 --- a/rllib/examples/connectors/frame_stacking.py +++ b/rllib/examples/connectors/frame_stacking.py @@ -190,12 +190,7 @@ def _env_creator(cfg): learner_connector=( None if args.use_gym_wrapper_framestacking else _make_learner_connector ), - lambda_=0.95, - kl_coeff=0.5, - clip_param=0.1, - vf_clip_param=10.0, entropy_coeff=0.01, - num_sgd_iter=10, # Linearly adjust learning rate based on number of GPUs. lr=0.00015 * (args.num_gpus or 1), grad_clip=100.0, @@ -214,6 +209,17 @@ def _env_creator(cfg): ) ) + # PPO specific settings. + if args.algo == "PPO": + base_config.training( + num_sgd_iter=10, + mini_batch_size_per_learner=64, + lambda_=0.95, + kl_coeff=0.5, + clip_param=0.1, + vf_clip_param=10.0, + ) + # Add a simple multi-agent setup. if args.num_agents > 0: base_config.multi_agent( diff --git a/rllib/examples/connectors/mean_std_filtering.py b/rllib/examples/connectors/mean_std_filtering.py index 470812585138..2fec8f3c63d0 100644 --- a/rllib/examples/connectors/mean_std_filtering.py +++ b/rllib/examples/connectors/mean_std_filtering.py @@ -122,7 +122,7 @@ def observation(self, observation): else: register_env("lopsided-pend", lambda _: LopsidedObs(gym.make("Pendulum-v1"))) - config = ( + base_config = ( get_trainable_cls(args.algo) .get_default_config() .environment("lopsided-pend") @@ -144,12 +144,9 @@ def observation(self, observation): ) .training( train_batch_size_per_learner=512, - mini_batch_size_per_learner=64, gamma=0.95, # Linearly adjust learning rate based on number of GPUs. lr=0.0003 * (args.num_gpus or 1), - lambda_=0.1, - vf_clip_param=10.0, vf_loss_coeff=0.01, ) .rl_module( @@ -183,11 +180,19 @@ def observation(self, observation): # ) ) + # PPO specific settings. + if args.algo == "PPO": + base_config.training( + mini_batch_size_per_learner=64, + lambda_=0.1, + vf_clip_param=10.0, + ) + # Add a simple multi-agent setup. if args.num_agents > 0: - config = config.multi_agent( + base_config.multi_agent( policies={f"p{i}" for i in range(args.num_agents)}, policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", ) - run_rllib_example_script_experiment(config, args) + run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/envs/custom_gym_env.py b/rllib/examples/envs/custom_gym_env.py index 616e3cee15b9..01fa5ecc452f 100644 --- a/rllib/examples/envs/custom_gym_env.py +++ b/rllib/examples/envs/custom_gym_env.py @@ -100,7 +100,7 @@ def reset(self, *, seed=None, options=None): random.seed(seed) self.cur_pos = 0 # Return obs and (empty) info dict. 
- return np.array([self.cur_pos], np.float32), {} + return np.array([self.cur_pos], np.float32), {"env_state": "reset"} def step(self, action): assert action in [0, 1], action diff --git a/rllib/examples/inference/policy_inference_after_training_w_connector.py b/rllib/examples/inference/policy_inference_after_training_w_connector.py index e4a66ec33266..10a6587c313c 100644 --- a/rllib/examples/inference/policy_inference_after_training_w_connector.py +++ b/rllib/examples/inference/policy_inference_after_training_w_connector.py @@ -178,7 +178,7 @@ def _env_creator(cfg): connectors=[ AddObservationsFromEpisodesToBatch(), AddStatesFromEpisodesToBatch(), - BatchIndividualItems(), + BatchIndividualItems(multi_agent=args.num_agents > 0), NumpyToTensor(), ], ) diff --git a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py index 1f7ad8dc238c..aaf0dfb732e6 100644 --- a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py +++ b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py @@ -57,6 +57,7 @@ default_timesteps=200000, default_reward=6.0, ) +parser.set_defaults(num_agents=2) parser.add_argument( "--use-lstm", action="store_true", diff --git a/rllib/examples/rl_modules/classes/tiny_atari_cnn.py b/rllib/examples/rl_modules/classes/tiny_atari_cnn.py index 2f45cf219734..e19d175d28e9 100644 --- a/rllib/examples/rl_modules/classes/tiny_atari_cnn.py +++ b/rllib/examples/rl_modules/classes/tiny_atari_cnn.py @@ -1,8 +1,12 @@ from ray.rllib.core.columns import Columns +from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.core.rl_module.torch import TorchRLModule +from ray.rllib.models.torch.misc import ( + normc_initializer, + same_padding, + valid_padding, +) from ray.rllib.models.torch.torch_distributions import TorchCategorical -from ray.rllib.models.torch.misc import normc_initializer -from ray.rllib.models.torch.misc import same_padding, valid_padding from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.torch_utils import convert_to_torch_tensor @@ -37,10 +41,9 @@ def setup(self): """ # Get the CNN stack config from our RLModuleConfig's (self.config) # `model_config_dict` property: - if "conv_filters" in self.config.model_config_dict: - conv_filters = self.config.model_config_dict["conv_filters"] + conv_filters = self.config.model_config_dict.get("conv_filters") # Default CNN stack with 3 layers: - else: + if conv_filters is None: conv_filters = [ [16, 4, 2, "same"], # num filters, kernel wxh, stride wxh, padding type [32, 4, 2, "same"], @@ -66,13 +69,10 @@ def setup(self): out_size = valid_padding(in_size, kernel_size, strides) layer = nn.Conv2d(in_depth, out_depth, kernel_size, strides, bias=True) - # Initialize CNN layer kernel. + # Initialize CNN layer kernel and bias. nn.init.xavier_uniform_(layer.weight) - # Initialize CNN layer bias. nn.init.zeros_(layer.bias) - layers.append(layer) - # Activation. layers.append(nn.ReLU()) @@ -83,10 +83,13 @@ def setup(self): # Add the final CNN 1x1 layer with num_filters == num_actions to be reshaped to # yield the logits (no flattening, no additional linear layers required). 
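+        # For example, with a 42x42 input and the 3-layer stack described in the
+        # Pong tuned example further below ([.., 4, 2, "same"], [.., 4, 2, "same"],
+        # [256, 11, 1, "valid"]), the spatial dims shrink 42 -> 21 -> 11 -> 1, so the
+        # 1x1 conv added here outputs [B, num_actions, 1, 1], which is squeezed to
+        # [B, num_actions] logits.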
+ _final_conv = nn.Conv2d(in_depth, self.config.action_space.n, 1, 1, bias=True) + nn.init.xavier_uniform_(_final_conv.weight) + nn.init.zeros_(_final_conv.bias) self._logits = nn.Sequential( - nn.ZeroPad2d(same_padding(in_size, 1, 1)[0]), - nn.Conv2d(in_depth, self.config.action_space.n, 1, 1, bias=True), + nn.ZeroPad2d(same_padding(in_size, 1, 1)[0]), _final_conv ) + self._values = nn.Linear(in_depth, 1) # Mimick old API stack behavior of initializing the value function with `normc` # std=0.01. @@ -115,25 +118,10 @@ def _forward_train(self, batch, **kwargs): Columns.VF_PREDS: values, } - # TODO (sven): We still need to define the distibution to use here, even though, - # we have a pretty standard action space (Discrete), which should simply always map - # to a categorical dist. by default. - @override(TorchRLModule) - def get_inference_action_dist_cls(self): - return TorchCategorical - - @override(TorchRLModule) - def get_exploration_action_dist_cls(self): - return TorchCategorical - - @override(TorchRLModule) - def get_train_action_dist_cls(self): - return TorchCategorical - # TODO (sven): In order for this RLModule to work with PPO, we must define # our own `_compute_values()` method. This would become more obvious, if we simply # subclassed the `PPOTorchRLModule` directly here (which we didn't do for - # simplicity and to keep some generality). We might change even get rid of algo- + # simplicity and to keep some generality). We might even get rid of algo- # specific RLModule subclasses altogether in the future and replace them # by mere algo-specific APIs (w/o any actual implementations). def _compute_values(self, batch, device): @@ -151,6 +139,24 @@ def _compute_features_and_logits(self, batch): torch.squeeze(logits, dim=[-1, -2]), ) + # TODO (sven): In order for this RLModule to work with PPO, we must define + # our own `get_..._action_dist_cls()` methods. This would become more obvious, + # if we simply subclassed the `PPOTorchRLModule` directly here (which we didn't do + # for simplicity and to keep some generality). We might even get rid of algo- + # specific RLModule subclasses altogether in the future and replace them + # by mere algo-specific APIs (w/o any actual implementations). + @override(RLModule) + def get_train_action_dist_cls(self): + return TorchCategorical + + @override(RLModule) + def get_exploration_action_dist_cls(self): + return TorchCategorical + + @override(RLModule) + def get_inference_action_dist_cls(self): + return TorchCategorical + if __name__ == "__main__": import numpy as np diff --git a/rllib/examples/rl_modules/custom_rl_module.py b/rllib/examples/rl_modules/custom_rl_module.py index b2f407946071..a75d59960044 100644 --- a/rllib/examples/rl_modules/custom_rl_module.py +++ b/rllib/examples/rl_modules/custom_rl_module.py @@ -98,19 +98,19 @@ # Plug-in our custom RLModule class. rl_module_spec=SingleAgentRLModuleSpec( module_class=TinyAtariCNN, - model_config_dict={"a": "b"}, + # Feel free to specify your own `model_config_dict` settings below. + # The `model_config_dict` defined here will be available inside your + # custom RLModule class through the `self.config.model_config_dict` + # property. + model_config_dict={ + "conv_filters": [ + # num filters, kernel wxh, stride wxh, padding type + [16, 4, 2, "same"], + [32, 4, 2, "same"], + [256, 11, 1, "valid"], + ], + }, ), - # Feel free to specify your own `model_config_dict` settings below. 
- # The `model_config_dict` defined here will be available inside your custom - # RLModule class through the `self.config.model_config_dict` property. - model_config_dict={ - "conv_filters": [ - # num filters, kernel wxh, stride wxh, padding type - [16, 4, 2, "same"], - [32, 4, 2, "same"], - [64, 4, 2, "same"], - ], - }, ) ) diff --git a/rllib/policy/sample_batch.py b/rllib/policy/sample_batch.py index 24a4359cc17a..10ebbe96b0f0 100644 --- a/rllib/policy/sample_batch.py +++ b/rllib/policy/sample_batch.py @@ -407,7 +407,7 @@ def rows(self) -> Iterator[Dict[str, TensorType]]: for i in range(self.count): yield tree.map_structure_with_path( - lambda p, v: v[i] if p[0] != self.SEQ_LENS else seq_lens, + lambda p, v, i=i: v[i] if p[0] != self.SEQ_LENS else seq_lens, self_as_dict, ) @@ -1468,6 +1468,19 @@ def copy(self) -> "MultiAgentBatch": {k: v.copy() for (k, v) in self.policy_batches.items()}, self.count ) + @ExperimentalAPI + def to_device(self, device, framework="torch"): + """TODO: transfer batch to given device as framework tensor.""" + if framework == "torch": + assert torch is not None + for pid, policy_batch in self.policy_batches.items(): + self.policy_batches[pid] = policy_batch.to_device( + device, framework=framework + ) + else: + raise NotImplementedError + return self + @PublicAPI def size_bytes(self) -> int: """ diff --git a/rllib/tests/test_nested_observation_spaces.py b/rllib/tests/test_nested_observation_spaces.py index c2efe8adc818..b4d236341f71 100644 --- a/rllib/tests/test_nested_observation_spaces.py +++ b/rllib/tests/test_nested_observation_spaces.py @@ -1,14 +1,13 @@ +import pickle +import unittest + from gymnasium import spaces import gymnasium as gym import numpy as np -import pickle -import unittest import ray from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.env import MultiAgentEnv -from ray.rllib.env.base_env import convert_to_base_env -from ray.rllib.env.vector_env import VectorEnv from ray.rllib.models import ModelCatalog from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.models.torch.fcnet import FullyConnectedNetwork @@ -389,114 +388,6 @@ def test_invalid_model(self): lambda: config.build(), ) - def test_invalid_model2(self): - ModelCatalog.register_custom_model("invalid2", InvalidModel2) - config = ( - PPOConfig() - .environment("CartPole-v1") - .framework("tf") - .training(model={"custom_model": "invalid2"}) - ) - self.assertRaisesRegex( - ValueError, - "State output is not a list", - lambda: config.build(), - ) - - def do_test_nested_dict(self, make_env, test_lstm=False): - ModelCatalog.register_custom_model("composite", DictSpyModel) - register_env("nested", make_env) - config = ( - PPOConfig() - .experimental(_disable_preprocessor_api=True) - .environment("nested") - .env_runners(num_env_runners=0, rollout_fragment_length=5) - .framework("tf") - .training( - model={"custom_model": "composite", "use_lstm": test_lstm}, - train_batch_size_per_learner=5, - sgd_minibatch_size=5, - ) - ) - algo = config.build() - # Skip first passes as they came from the TorchPolicy loss - # initialization. 
- DictSpyModel.capture_index = 0 - algo.train() - - # Check that the model sees the correct reconstructed observations - for i in range(4): - seen = pickle.loads( - ray.experimental.internal_kv._internal_kv_get("d_spy_in_{}".format(i)) - ) - pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist() - cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist() - task_i = DICT_SAMPLES[i]["inner_state"]["job_status"]["task"] - self.assertEqual(seen[0][0].tolist(), pos_i) - self.assertEqual(seen[1][0].tolist(), cam_i) - check(seen[2][0], task_i) - algo.stop() - - def do_test_nested_tuple(self, make_env): - ModelCatalog.register_custom_model("composite2", TupleSpyModel) - register_env("nested2", make_env) - config = ( - PPOConfig() - .experimental(_disable_preprocessor_api=True) - .environment("nested2") - .env_runners(num_env_runners=0, rollout_fragment_length=5) - .framework("tf") - .training( - model={"custom_model": "composite2"}, - train_batch_size_per_learner=5, - sgd_minibatch_size=5, - ) - ) - - algo = config.build() - # Skip first passes as they came from the TorchPolicy loss - # initialization. - TupleSpyModel.capture_index = 0 - algo.train() - - # Check that the model sees the correct reconstructed observations - for i in range(4): - seen = pickle.loads( - ray.experimental.internal_kv._internal_kv_get("t_spy_in_{}".format(i)) - ) - pos_i = TUPLE_SAMPLES[i][0].tolist() - cam_i = TUPLE_SAMPLES[i][1][0].tolist() - task_i = TUPLE_SAMPLES[i][2] - self.assertEqual(seen[0][0].tolist(), pos_i) - self.assertEqual(seen[1][0].tolist(), cam_i) - check(seen[2][0], task_i) - algo.stop() - - def test_nested_dict_gym(self): - self.do_test_nested_dict(lambda _: NestedDictEnv()) - - def test_nested_dict_gym_lstm(self): - self.do_test_nested_dict(lambda _: NestedDictEnv(), test_lstm=True) - - def test_nested_dict_vector(self): - self.do_test_nested_dict( - lambda _: VectorEnv.vectorize_gym_envs(lambda i: NestedDictEnv()) - ) - - def test_nested_dict_async(self): - self.do_test_nested_dict(lambda _: convert_to_base_env(NestedDictEnv())) - - def test_nested_tuple_gym(self): - self.do_test_nested_tuple(lambda _: NestedTupleEnv()) - - def test_nested_tuple_vector(self): - self.do_test_nested_tuple( - lambda _: VectorEnv.vectorize_gym_envs(lambda i: NestedTupleEnv()) - ) - - def test_nested_tuple_async(self): - self.do_test_nested_tuple(lambda _: convert_to_base_env(NestedTupleEnv())) - def test_torch_model(self): ModelCatalog.register_custom_model("composite", TorchSpyModel) register_env("nested", lambda _: NestedDictEnv()) diff --git a/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py b/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py index daba926f93a8..c0a9d18eed8b 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py +++ b/rllib/tuned_examples/appo/cartpole-appo-separate-losses.py @@ -9,7 +9,7 @@ stop = { - f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 150, + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 400, f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 200000, } diff --git a/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml b/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml deleted file mode 100644 index aac6c4c6956d..000000000000 --- a/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# @OldAPIStack -cartpole-appo-w-rl-modules-and-learner: - env: CartPole-v1 - run: APPO - stop: - env_runners/episode_return_mean: 150 - timesteps_total: 200000 - config: - # Run with Learner- and 
RLModule API (new stack). - enable_rl_module_and_learner: true - - # Works for both torch and tf. - framework: torch - num_envs_per_env_runner: 5 - num_env_runners: 1 - num_gpus: 0 - observation_filter: MeanStdFilter - num_sgd_iter: 6 - vf_loss_coeff: 0.01 - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - - # Need to unset this b/c we are using the RLModule API, which - # provides exploration control via the RLModule's `forward_exploration` method. - exploration_config: {} diff --git a/rllib/tuned_examples/impala/cartpole-impala.yaml b/rllib/tuned_examples/impala/cartpole-impala.yaml index 7ae99b2e50b8..c332f1b5f3fa 100644 --- a/rllib/tuned_examples/impala/cartpole-impala.yaml +++ b/rllib/tuned_examples/impala/cartpole-impala.yaml @@ -8,7 +8,7 @@ cartpole-impala: config: enable_rl_module_and_learner: true # Works for both torch and tf. - framework: tf2 + framework: torch num_gpus: 0 grad_clip: 40 num_env_runners: 2 diff --git a/rllib/tuned_examples/impala/cartpole_impala.py b/rllib/tuned_examples/impala/cartpole_impala.py new file mode 100644 index 000000000000..53e74a390d82 --- /dev/null +++ b/rllib/tuned_examples/impala/cartpole_impala.py @@ -0,0 +1,46 @@ +from ray.rllib.algorithms.impala import ImpalaConfig +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import add_rllib_example_script_args + +parser = add_rllib_example_script_args() +args = parser.parse_args() + + +config = ( + ImpalaConfig() + # Enable new API stack and use EnvRunner. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .environment("CartPole-v1") + .training( + train_batch_size_per_learner=500, + grad_clip=40.0, + grad_clip_by="global_norm", + lr=0.0005 * ((args.num_gpus or 1) ** 0.5), + vf_loss_coeff=0.05, + entropy_coeff=0.0, + ) + .rl_module( + model_config_dict={ + "vf_share_layers": True, + "uses_new_env_runners": True, + }, + ) +) + +stop = { + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 450.0, + NUM_ENV_STEPS_SAMPLED_LIFETIME: 2000000, +} + + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/tuned_examples/impala/multi_agent_cartpole_impala_envrunner.py b/rllib/tuned_examples/impala/multi_agent_cartpole_impala_envrunner.py new file mode 100644 index 000000000000..4369e28decf6 --- /dev/null +++ b/rllib/tuned_examples/impala/multi_agent_cartpole_impala_envrunner.py @@ -0,0 +1,46 @@ +from ray.rllib.algorithms.impala import ImpalaConfig +from ray.rllib.connectors.env_to_module.mean_std_filter import MeanStdFilter +from ray.rllib.env.multi_agent_env_runner import MultiAgentEnvRunner +from ray.rllib.examples.env.multi_agent import MultiAgentCartPole +from ray import tune + +tune.registry.register_env("env", lambda cfg: MultiAgentCartPole(config=cfg)) + + +config = ( + ImpalaConfig() + # Enable new API stack and use EnvRunner. 
+ .experimental(_enable_new_api_stack=True) + .environment("env", env_config={"num_agents": 2}) + .rollouts( + env_runner_cls=MultiAgentEnvRunner, + env_to_module_connector=lambda env: MeanStdFilter(multi_agent=True), + num_envs_per_env_runner=1, + num_rollout_workers=4, + ) + .resources( + num_learner_workers=1, + num_gpus=0, + num_cpus_for_local_worker=1, + ) + .training( + train_batch_size_per_learner=500, + grad_clip=40.0, + grad_clip_by="global_norm", + lr=0.0005, + vf_loss_coeff=0.1, + model={ + "vf_share_layers": True, + "uses_new_env_runners": True, + }, + ) + .multi_agent( + policies=["p0", "p1"], + policy_mapping_fn=(lambda agent_id, episode, **kwargs: f"p{agent_id}"), + ) +) + +stop = { + "sampler_results/episode_reward_mean": 800.0, + "timesteps_total": 400000, +} diff --git a/rllib/tuned_examples/impala/pendulum_impala.py b/rllib/tuned_examples/impala/pendulum_impala.py new file mode 100644 index 000000000000..ff870b44506f --- /dev/null +++ b/rllib/tuned_examples/impala/pendulum_impala.py @@ -0,0 +1,46 @@ +from ray.rllib.algorithms.impala import ImpalaConfig +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import add_rllib_example_script_args + +parser = add_rllib_example_script_args() +args = parser.parse_args() + +config = ( + ImpalaConfig() + # Enable new API stack and use EnvRunner. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .env_runners(num_envs_per_env_runner=5) + .environment("Pendulum-v1") + .training( + train_batch_size_per_learner=256, + grad_clip=40.0, + grad_clip_by="global_norm", + lr=0.0003 * ((args.num_gpus or 1) ** 0.5), + vf_loss_coeff=0.05, + entropy_coeff=[[0, 0.1], [2000000, 0.0]], + ) + .rl_module( + model_config_dict={ + "vf_share_layers": True, + "fcnet_hiddens": [512, 512], + "uses_new_env_runners": True, + }, + ) +) + +stop = { + f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": -200.0, + NUM_ENV_STEPS_SAMPLED_LIFETIME: 5000000, +} + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/tuned_examples/impala/pong_impala.py b/rllib/tuned_examples/impala/pong_impala.py new file mode 100644 index 000000000000..89feb5c5b42a --- /dev/null +++ b/rllib/tuned_examples/impala/pong_impala.py @@ -0,0 +1,93 @@ +import gymnasium as gym + +from ray.rllib.algorithms.impala import ImpalaConfig +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack +from ray.rllib.examples.rl_modules.classes.tiny_atari_cnn import TinyAtariCNN +from ray.rllib.utils.test_utils import add_rllib_example_script_args +from ray.tune.registry import register_env + +parser = add_rllib_example_script_args() +parser.set_defaults(env="ALE/Pong-v5") +parser.add_argument( + "--use-tiny-cnn", + action="store_true", + help="Whether to use the old API stack's small CNN Atari architecture, stacking " + "3 CNN layers ([32, 4, 2, same], [64, 4, 2, same], [256, 11, 1, valid]) for the " + "base features and then a CNN pi-head with an output of [num-actions, 1, 1] and " + "a Linear(1) layer for the values. 
The actual RLModule class used can be found " + "here: ray.rllib.examples.rl_modules.classes.tiny_atari_cnn", +) +args = parser.parse_args() + + +def _env_creator(cfg): + return wrap_atari_for_new_api_stack( + gym.make(args.env, **cfg, **{"render_mode": "rgb_array"}), + dim=42 if args.use_tiny_cnn else 64, + # TODO (sven): Use FrameStacking Connector here for some speedup. + framestack=4, + ) + + +register_env("env", _env_creator) + + +config = ( + ImpalaConfig() + # Enable new API stack and use EnvRunner. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .environment( + "env", + env_config={ + # Make analogous to old v4 + NoFrameskip. + "frameskip": 1, + "full_action_space": False, + "repeat_action_probability": 0.0, + }, + clip_rewards=True, + ) + .env_runners(num_envs_per_env_runner=5) + .training( + train_batch_size_per_learner=500, + grad_clip=40.0, + grad_clip_by="global_norm", + lr=0.007 * ((args.num_gpus or 1) ** 0.5), + vf_loss_coeff=0.5, + entropy_coeff=0.008, # <- crucial parameter to finetune + # Only update connector states and model weights every n training_step calls. + broadcast_interval=5, + ) + .rl_module( + rl_module_spec=( + SingleAgentRLModuleSpec(module_class=TinyAtariCNN) + if args.use_tiny_cnn + else None + ), + model_config_dict=( + { + "vf_share_layers": True, + "conv_filters": [[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], + "conv_activation": "relu", + "post_fcnet_hiddens": [256], + "uses_new_env_runners": True, + } + if not args.use_tiny_cnn + else {} + ), + ) +) + +stop = { + "env_runner_results/episode_return_mean": 20.0, + "num_env_steps_sampled_lifetime": 5000000, +} + + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/tuned_examples/impala/pong_impala_pb2_hyperopt.py b/rllib/tuned_examples/impala/pong_impala_pb2_hyperopt.py new file mode 100644 index 000000000000..b46bbb85468a --- /dev/null +++ b/rllib/tuned_examples/impala/pong_impala_pb2_hyperopt.py @@ -0,0 +1,130 @@ +import gymnasium as gym + +from ray.rllib.algorithms.impala import ImpalaConfig +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack +from ray.rllib.examples.rl_modules.classes.tiny_atari_cnn import TinyAtariCNN +from ray.rllib.utils.metrics import ( + ENV_RUNNER_RESULTS, + EPISODE_RETURN_MEAN, + NUM_ENV_STEPS_SAMPLED_LIFETIME, +) +from ray.rllib.utils.test_utils import add_rllib_example_script_args +from ray.tune.registry import register_env +from ray.tune.schedulers.pb2 import PB2 +from ray import tune + +parser = add_rllib_example_script_args() +parser.set_defaults(env="ALE/Pong-v5") +parser.add_argument( + "--use-tiny-cnn", + action="store_true", + help="Whether to use the old API stack's small CNN Atari architecture, stacking " + "3 CNN layers ([32, 4, 2, same], [64, 4, 2, same], [256, 11, 1, valid]) for the " + "base features and then a CNN pi-head with an output of [num-actions, 1, 1] and " + "a Linear(1) layer for the values. The actual RLModule class used can be found " + "here: ray.rllib.examples.rl_modules.classes.tiny_atari_cnn", +) +args = parser.parse_args() + + +def _env_creator(cfg): + return wrap_atari_for_new_api_stack( + gym.make(args.env, **cfg, **{"render_mode": "rgb_array"}), + dim=42 if args.use_tiny_cnn else 64, + # TODO (sven): Use FrameStacking Connector here for some speedup. 
+ framestack=4, + ) + + +register_env("env", _env_creator) + +pb2_scheduler = PB2( + time_attr=NUM_ENV_STEPS_SAMPLED_LIFETIME, + metric=f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}", + mode="max", + perturbation_interval=50000, + # Copy bottom % with top % weights. + quantile_fraction=0.25, + hyperparam_bounds={ + "lr": [0.0001, 0.02], + "gamma": [0.95, 1.0], + "entropy_coeff": [0.001, 0.025], + "vf_loss_coeff": [0.1, 1.0], + "grad_clip": [10, 200], + "broadcast_interval": [2, 7], + }, +) + +config = ( + ImpalaConfig() + # Enable new API stack and use EnvRunner. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + .environment( + "env", + env_config={ + # Make analogous to old v4 + NoFrameskip. + "frameskip": 1, + "full_action_space": False, + "repeat_action_probability": 0.0, + }, + clip_rewards=True, + ) + .env_runners( + num_envs_per_env_runner=5, + ) + # .training( + # train_batch_size_per_learner=500, + # grad_clip=40.0, + # grad_clip_by="global_norm", + # vf_loss_coeff=0.5, + # entropy_coeff=0.008, + # # Only update connector states and model weights every n training_step calls. + # broadcast_interval=5, + # lr=0.009 * ((args.num_gpus or 1) ** 0.5), + # ) + .training( + train_batch_size_per_learner=tune.randint(256, 1024), + grad_clip=tune.choice([10, 40, 100, 200]), + grad_clip_by="global_norm", + vf_loss_coeff=tune.uniform(0.1, 1.0), + entropy_coeff=tune.choice([0.001, 0.025]), + lr=tune.uniform(0.0001, 0.02), + # Only update connector states and model weights every n training_step calls. + broadcast_interval=tune.randint(2, 7), + gamma=tune.uniform(0.95, 1.0), + ) + .rl_module( + rl_module_spec=( + SingleAgentRLModuleSpec(module_class=TinyAtariCNN) + if args.use_tiny_cnn + else None + ), + model_config_dict=( + { + "vf_share_layers": True, + "conv_filters": [[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], + "conv_activation": "relu", + "post_fcnet_hiddens": [256], + "uses_new_env_runners": True, + } + if not args.use_tiny_cnn + else {} + ), + ) +) + +stop = { + "env_runner_results/episode_return_mean": 21.0, + "num_env_steps_sampled_lifetime": 10000000000, +} + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment( + config, args, stop=stop, scheduler=pb2_scheduler + ) diff --git a/rllib/utils/actor_manager.py b/rllib/utils/actor_manager.py index e32878825983..1ee0d247e907 100644 --- a/rllib/utils/actor_manager.py +++ b/rllib/utils/actor_manager.py @@ -592,7 +592,8 @@ def foreach_actor( `mark_healthy=True`, will send `func` to all actors and mark those actors "healthy" that respond to the request within `timeout_seconds` and are currently tagged as "unhealthy". - remote_actor_ids: Apply func on a selected set of remote actors. + remote_actor_ids: Apply func on a selected set of remote actors. Use None + (default) for all actors. timeout_seconds: Time to wait (in seconds) for results. Set this to 0.0 for fire-and-forget. Set this to None (default) to wait infinitely (i.e. for synchronous execution). 
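These timeout semantics mirror `EnvRunnerGroup.foreach_worker()` above. A minimal usage sketch (assuming an already built `EnvRunnerGroup` instance; `env_runner_group` and `state_update` are placeholder names):

    # Fire-and-forget: push a state update to all remote EnvRunners and do not
    # wait for the calls to return (timeout_seconds=0.0).
    env_runner_group.foreach_worker(
        lambda w: w.set_state(state_update),
        local_worker=False,
        timeout_seconds=0.0,
    )

    # Synchronous: wait (indefinitely) for every healthy remote EnvRunner to
    # report its sampling metrics (timeout_seconds=None).
    results = env_runner_group.foreach_worker(
        lambda w: w.get_metrics(),
        local_worker=False,
        timeout_seconds=None,
    )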
diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py
index 764edabdb8a2..de4fef852de7 100644
--- a/rllib/utils/metrics/__init__.py
+++ b/rllib/utils/metrics/__init__.py
@@ -2,13 +2,19 @@
 EVALUATION_RESULTS = "evaluation"
 ENV_RUNNER_RESULTS = "env_runners"
 REPLAY_BUFFER_RESULTS = "replay_buffer"
+LEARNER_GROUP = "learner_group"
 LEARNER_RESULTS = "learners"
 FAULT_TOLERANCE_STATS = "fault_tolerance"
 TIMERS = "timers"
 # ALGORITHM_RESULTS = "algorithm"
 
+# RLModule metrics
+NUM_TRAINABLE_PARAMETERS = "num_trainable_parameters"
+NUM_NON_TRAINABLE_PARAMETERS = "num_non_trainable_parameters"
+
 # Counters for sampling, sampling (on eval workers) and
 # training steps (env- and agent steps).
+MEAN_NUM_EPISODE_LISTS_RECEIVED = "mean_num_episode_lists_received"
 NUM_AGENT_STEPS_SAMPLED = "num_agent_steps_sampled"
 NUM_AGENT_STEPS_SAMPLED_LIFETIME = "num_agent_steps_sampled_lifetime"
 NUM_AGENT_STEPS_SAMPLED_THIS_ITER = "num_agent_steps_sampled_this_iter"  # @OldAPIStack
@@ -30,7 +36,9 @@
 EPISODE_RETURN_MIN = "episode_return_min"
 NUM_EPISODES = "num_episodes"
 NUM_EPISODES_LIFETIME = "num_episodes_lifetime"
+TIME_BETWEEN_SAMPLING = "time_between_sampling"
 
+MEAN_NUM_LEARNER_GROUP_UPDATE_CALLED = "mean_num_learner_group_update_called"
 NUM_AGENT_STEPS_TRAINED = "num_agent_steps_trained"
 NUM_AGENT_STEPS_TRAINED_LIFETIME = "num_agent_steps_trained_lifetime"
 NUM_AGENT_STEPS_TRAINED_THIS_ITER = "num_agent_steps_trained_this_iter"  # @OldAPIStack
@@ -48,6 +56,11 @@
 NUM_TRAINING_STEP_CALLS_SINCE_LAST_SYNCH_WORKER_WEIGHTS = (
     "num_training_step_calls_since_last_synch_worker_weights"
 )
+# The running sequence number for a set of NN weights. If a worker's NN has a
+# lower sequence number than some weights coming in for an update, the worker
+# should perform the update; otherwise it should ignore the incoming weights (they
+# are the same as, or older than, the ones it already has).
+WEIGHTS_SEQ_NO = "weights_seq_no"
 # Number of total gradient updates that have been performed on a policy.
 NUM_GRAD_UPDATES_LIFETIME = "num_grad_updates_lifetime"
 # Average difference between the number of grad-updates that the policy/ies had
diff --git a/rllib/utils/metrics/stats.py b/rllib/utils/metrics/stats.py
index b6cb3fae0133..fbff2b1fb83b 100644
--- a/rllib/utils/metrics/stats.py
+++ b/rllib/utils/metrics/stats.py
@@ -624,12 +624,24 @@ def _reduced_values(self, values=None, window=None) -> Tuple[Any, Any]:
         else:
             # Use the numpy/torch "nan"-prefix to ignore NaN's in our value lists.
             if torch and torch.is_tensor(values[0]):
+                # TODO (sven): Currently, tensor metrics only work with window=1.
+                # We might want to enforce it more formally, b/c it's probably not a
+                # good idea to have MetricsLogger or Stats tinker with the actual
+                # computation graph that users are trying to build in their loss
+                # functions.
+                assert len(values) == 1
                 assert all(torch.is_tensor(v) for v in values), values
-                reduce_meth = getattr(torch, "nan" + self._reduce_method)
-                reduce_in = torch.stack(values)
-                if self._reduce_method == "mean":
-                    reduce_in = reduce_in.float()
-                reduced = reduce_meth(reduce_in)
+                # TODO (sven): If the shape is (), do NOT even use the reduce method.
+                # Using `tf.reduce_mean()` here actually led to a completely broken
+                # DreamerV3 (for a still unknown exact reason).
+                if len(values[0].shape) == 0:
+                    reduced = values[0]
+                else:
+                    reduce_meth = getattr(torch, "nan" + self._reduce_method)
+                    reduce_in = torch.stack(values)
+                    if self._reduce_method == "mean":
+                        reduce_in = reduce_in.float()
+                    reduced = reduce_meth(reduce_in)
             elif tf and tf.is_tensor(values[0]):
                 # TODO (sven): Currently, tensor metrics only work with window=1.
                 # We might want o enforce it more formally, b/c it's probably not a
diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py
index 0849d3900c26..8c576234e436 100644
--- a/rllib/utils/minibatch_utils.py
+++ b/rllib/utils/minibatch_utils.py
@@ -255,3 +255,35 @@ def __iter__(self):
 
         for sublist in sublists:
             yield sublist
+
+
+@DeveloperAPI
+class ShardObjectRefIterator:
+    """Iterator for sharding a list of Ray ObjectRefs into num_shards sub-lists.
+
+    Args:
+        object_refs: The input list of Ray ObjectRefs.
+        num_shards: The number of shards to split the references into.
+
+    Yields:
+        A sub-list of Ray ObjectRefs with lengths as equal as possible.
+    """
+
+    def __init__(self, object_refs, num_shards: int):
+        self._object_refs = object_refs
+        self._num_shards = num_shards
+
+    def __iter__(self):
+        # Calculate the size of each sublist.
+        n = len(self._object_refs)
+        sublist_size = n // self._num_shards
+        remaining_elements = n % self._num_shards
+
+        start = 0
+        for i in range(self._num_shards):
+            # Determine the end index for the current sublist.
+            end = start + sublist_size + (1 if i < remaining_elements else 0)
+            # Yield the current sublist.
+            yield self._object_refs[start:end]
+            # Update the start index for the next sublist.
+            start = end
diff --git a/rllib/utils/postprocessing/episodes.py b/rllib/utils/postprocessing/episodes.py
index 3f5c61b2db9c..7c01f18cc0cb 100644
--- a/rllib/utils/postprocessing/episodes.py
+++ b/rllib/utils/postprocessing/episodes.py
@@ -114,7 +114,7 @@ def remove_last_ts_from_data(
     ret = []
     for d in data:
         ret.append(np.concatenate([d[s] for s in slices]))
-    return tuple(ret)
+    return tuple(ret) if len(ret) > 1 else ret[0]
 
 
 @DeveloperAPI
diff --git a/rllib/utils/postprocessing/value_predictions.py b/rllib/utils/postprocessing/value_predictions.py
index 7bf2200b1f22..0c0f88e7f49e 100644
--- a/rllib/utils/postprocessing/value_predictions.py
+++ b/rllib/utils/postprocessing/value_predictions.py
@@ -75,6 +75,8 @@ def extract_bootstrapped_values(vf_preds, episode_lengths, T):
     )
 
     # Loop over all episode lengths and collect bootstrap values.
+    # Do not alter incoming `episode_lengths` list.
+    episode_lengths = episode_lengths[:]
     i = -1
     while i < len(episode_lengths) - 1:
         i += 1
diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py
index eaed1c3d0590..6fdc5beeabcf 100644
--- a/rllib/utils/test_utils.py
+++ b/rllib/utils/test_utils.py
@@ -275,7 +275,14 @@ def add_rllib_example_script_args(
     # Learner scaling options.
    # Old API stack: config.num_gpus.
     # New API stack: config.num_learners (w/ num_gpus_per_learner=1).
-    parser.add_argument("--num-gpus", type=int, default=0)
+    parser.add_argument(
+        "--num-gpus",
+        type=int,
+        default=0,
+        help="The number of GPUs/Learners to use. If no GPUs (or not enough GPUs) "
+        "are available, `--num-gpus` Learners are still created, but each is "
+        "placed on a CPU instead.",
+    )
 
     # Ray init options.
     parser.add_argument("--num-cpus", type=int, default=0)
@@ -1360,6 +1367,7 @@ def run_rllib_example_script_experiment(
     trainable: Optional[Type] = None,
     tune_callbacks: Optional[List] = None,
     keep_config: bool = False,
+    scheduler=None,
 ) -> Union[ResultDict, tune.result_grid.ResultGrid]:
     """Given an algorithm config and some command line args, runs an experiment.
 
@@ -1468,14 +1476,16 @@ def run_rllib_example_script_experiment(
         config.resources(num_gpus=0)
         config.learners(
             num_learners=args.num_gpus,
-            num_gpus_per_learner=1 if torch.cuda.is_available() else 0,
+            num_gpus_per_learner=(
+                1
+                if torch and torch.cuda.is_available() and args.num_gpus > 0
+                else 0
+            ),
         )
+        config.resources(num_gpus=0)
     # Old stack.
     else:
-        config.resources(
-            num_gpus=args.num_gpus,
-            num_cpus_for_main_process=1,
-        )
+        config.resources(num_gpus=args.num_gpus)
 
     # Evaluation setup.
     if args.evaluation_interval > 0:
@@ -1577,7 +1587,10 @@
             ),
             progress_reporter=progress_reporter,
         ),
-        tune_config=tune.TuneConfig(num_samples=args.num_samples),
+        tune_config=tune.TuneConfig(
+            num_samples=args.num_samples,
+            scheduler=scheduler,
+        ),
     ).fit()
 
     time_taken = time.time() - start_time
diff --git a/rllib/utils/torch_utils.py b/rllib/utils/torch_utils.py
index a8e77c9af356..94c24716e549 100644
--- a/rllib/utils/torch_utils.py
+++ b/rllib/utils/torch_utils.py
@@ -221,12 +221,19 @@ def convert_to_non_torch_type(stats: TensorStructType) -> TensorStructType:
 
 
 @PublicAPI
-def convert_to_torch_tensor(x: TensorStructType, device: Optional[str] = None):
+def convert_to_torch_tensor(
+    x: TensorStructType,
+    device: Optional[str] = None,
+    pin_memory: bool = False,
+):
     """Converts any struct to torch.Tensors.
 
-    x: Any (possibly nested) struct, the values in which will be
-        converted and returned as a new struct with all leaves converted
-        to torch tensors.
+    Args:
+        x: Any (possibly nested) struct, the values in which will be
+            converted and returned as a new struct with all leaves converted
+            to torch tensors.
+        device: The device to create the tensor on.
+        pin_memory: If True, will call the `pin_memory()` method on the created tensors.
 
     Returns:
         Any: A new struct with the same structure as `x`, but with all
@@ -270,6 +277,10 @@ def mapping(item):
         if tensor.is_floating_point():
             tensor = tensor.float()
 
+        # Pin the tensor's memory (for faster transfer to GPU later); note that
+        # `pin_memory()` is not in-place, so re-assign the returned tensor.
+        if pin_memory and torch.cuda.is_available():
+            tensor = tensor.pin_memory()
+
         return tensor if device is None else tensor.to(device)
 
     return tree.map_structure(mapping, x)
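A short usage sketch for the new `pin_memory` flag (illustrative only; the batch contents and the explicit `.to("cuda", non_blocking=True)` call are assumptions, not part of this patch): convert the (possibly nested) struct to CPU tensors with pinned memory first, then move them to the GPU with non-blocking copies, which is what pinned (page-locked) host memory enables.

    import numpy as np
    import torch

    from ray.rllib.utils.torch_utils import convert_to_torch_tensor

    # A small, hypothetical sample batch with numpy leaves.
    batch = {
        "obs": np.random.rand(32, 4).astype(np.float32),
        "rewards": np.zeros(32, dtype=np.float32),
    }

    # Convert to torch tensors on the CPU, pinning their memory (a no-op if CUDA
    # is unavailable, per the code above).
    cpu_batch = convert_to_torch_tensor(batch, pin_memory=True)

    # Pinned source tensors allow asynchronous host-to-device copies.
    if torch.cuda.is_available():
        gpu_batch = {k: v.to("cuda", non_blocking=True) for k, v in cpu_batch.items()}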