diff --git a/rllib/BUILD b/rllib/BUILD index 684873b15da6..bed02f025a09 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1898,13 +1898,6 @@ py_test( srcs = ["utils/postprocessing/tests/test_value_predictions.py"] ) -py_test( - name = "test_random_encoder", - tags = ["team:rllib", "utils"], - size = "large", - srcs = ["utils/exploration/tests/test_random_encoder.py"] -) - py_test( name = "test_torch_utils", tags = ["team:rllib", "utils", "gpu"], diff --git a/rllib/algorithms/appo/tests/test_appo.py b/rllib/algorithms/appo/tests/test_appo.py index e9c114bbc2e7..505319218577 100644 --- a/rllib/algorithms/appo/tests/test_appo.py +++ b/rllib/algorithms/appo/tests/test_appo.py @@ -4,11 +4,7 @@ import ray.rllib.algorithms.appo as appo from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY -from ray.rllib.utils.test_utils import ( - check_compute_single_action, - check_train_results, - framework_iterator, -) +from ray.rllib.utils.test_utils import check_compute_single_action, check_train_results class TestAPPO(unittest.TestCase): @@ -25,15 +21,14 @@ def test_appo_compilation(self): config = appo.APPOConfig().env_runners(num_env_runners=1) num_iterations = 2 - for _ in framework_iterator(config): - algo = config.build(env="CartPole-v1") - for i in range(num_iterations): - results = algo.train() - print(results) - check_train_results(results) + algo = config.build(env="CartPole-v1") + for i in range(num_iterations): + results = algo.train() + print(results) + check_train_results(results) - check_compute_single_action(algo) - algo.stop() + check_compute_single_action(algo) + algo.stop() def test_appo_compilation_use_kl_loss(self): """Test whether APPO can be built with kl_loss enabled.""" @@ -42,14 +37,13 @@ def test_appo_compilation_use_kl_loss(self): ) num_iterations = 2 - for _ in framework_iterator(config): - algo = config.build(env="CartPole-v1") - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo) - algo.stop() + algo = config.build(env="CartPole-v1") + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + check_compute_single_action(algo) + algo.stop() def test_appo_two_optimizers_two_lrs(self): # Not explicitly setting this should cause a warning, but not fail. @@ -71,14 +65,13 @@ def test_appo_two_optimizers_two_lrs(self): num_iterations = 2 # Only supported for tf so far. - for _ in framework_iterator(config, frameworks=("torch", "tf2", "tf")): - algo = config.build(env="CartPole-v1") - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo) - algo.stop() + algo = config.build(env="CartPole-v1") + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + check_compute_single_action(algo) + algo.stop() def test_appo_entropy_coeff_schedule(self): # Initial lr, doesn't really matter because of the schedule below. @@ -122,19 +115,18 @@ def _step_n_times(algo, n: int): "entropy_coeff" ] - for _ in framework_iterator(config, frameworks=("torch", "tf")): - algo = config.build(env="CartPole-v1") + algo = config.build(env="CartPole-v1") - coeff = _step_n_times(algo, 10) # 200 timesteps - # Should be close to the starting coeff of 0.01. 
- self.assertLessEqual(coeff, 0.01) - self.assertGreaterEqual(coeff, 0.001) + coeff = _step_n_times(algo, 10) # 200 timesteps + # Should be close to the starting coeff of 0.01. + self.assertLessEqual(coeff, 0.01) + self.assertGreaterEqual(coeff, 0.001) - coeff = _step_n_times(algo, 20) # 400 timesteps - # Should have annealed to the final coeff of 0.0001. - self.assertLessEqual(coeff, 0.001) + coeff = _step_n_times(algo, 20) # 400 timesteps + # Should have annealed to the final coeff of 0.0001. + self.assertLessEqual(coeff, 0.001) - algo.stop() + algo.stop() def test_appo_learning_rate_schedule(self): config = ( @@ -173,15 +165,14 @@ def _step_n_times(algo, n: int): "cur_lr" ] - for _ in framework_iterator(config): - algo = config.build(env="CartPole-v1") + algo = config.build(env="CartPole-v1") - lr1 = _step_n_times(algo, 10) # 200 timesteps - lr2 = _step_n_times(algo, 10) # 200 timesteps + lr1 = _step_n_times(algo, 10) # 200 timesteps + lr2 = _step_n_times(algo, 10) # 200 timesteps - self.assertGreater(lr1, lr2) + self.assertGreater(lr1, lr2) - algo.stop() + algo.stop() def test_appo_model_variables(self): config = ( @@ -202,13 +193,12 @@ def test_appo_model_variables(self): ) ) - for _ in framework_iterator(config, frameworks=["tf2", "torch"]): - algo = config.build(env="CartPole-v1") - state = algo.get_policy(DEFAULT_POLICY_ID).get_state() - # Weights and Biases for the single hidden layer, the output layer - # of the policy and value networks. So 6 tensors in total. - # We should not get the tensors from the target model here. - self.assertEqual(len(state["weights"]), 6) + algo = config.build(env="CartPole-v1") + state = algo.get_policy(DEFAULT_POLICY_ID).get_state() + # Weights and Biases for the single hidden layer, the output layer + # of the policy and value networks. So 6 tensors in total. + # We should not get the tensors from the target model here. + self.assertEqual(len(state["weights"]), 6) if __name__ == "__main__": diff --git a/rllib/algorithms/bc/tests/test_bc_old_api_stack.py b/rllib/algorithms/bc/tests/test_bc_old_api_stack.py index 98621e8be8bc..d564121fe028 100644 --- a/rllib/algorithms/bc/tests/test_bc_old_api_stack.py +++ b/rllib/algorithms/bc/tests/test_bc_old_api_stack.py @@ -11,7 +11,6 @@ from ray.rllib.utils.test_utils import ( check_compute_single_action, check_train_results, - framework_iterator, ) @@ -50,48 +49,42 @@ def test_bc_compilation_and_learning_from_offline_file(self): num_iterations = 350 min_return_to_reach = 75.0 - # Test for the following frameworks. - frameworks_to_test = ("torch", "tf") - - for _ in framework_iterator(config, frameworks=frameworks_to_test): - for recurrent in [True, False]: - # We only test recurrent networks with RLModules. - if recurrent: - # TODO (Artur): We read input data without a time-dimensions. - # In order for a recurrent offline learning RL Module to - # work, the input data needs to be transformed do add a - # time-dimension. - continue - - config.training(model={"use_lstm": recurrent}) - algo = config.build(env="CartPole-v1") - learnt = False - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - - eval_results = results.get("evaluation") - if eval_results: - mean_return = eval_results[ENV_RUNNER_RESULTS][ - EPISODE_RETURN_MEAN - ] - print("iter={} R={}".format(i, mean_return)) - # Learn until good reward is reached in the actual env. 
- if mean_return > min_return_to_reach: - print("learnt!") - learnt = True - break - - if not learnt: - raise ValueError( - "`BC` did not reach {} reward from expert offline " - "data!".format(min_return_to_reach) - ) - - check_compute_single_action(algo, include_prev_action_reward=True) - - algo.stop() + for recurrent in [True, False]: + # We only test recurrent networks with RLModules. + if recurrent: + # TODO (Artur): We read input data without a time-dimensions. + # In order for a recurrent offline learning RL Module to + # work, the input data needs to be transformed do add a + # time-dimension. + continue + + config.training(model={"use_lstm": recurrent}) + algo = config.build(env="CartPole-v1") + learnt = False + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + + eval_results = results.get("evaluation") + if eval_results: + mean_return = eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] + print("iter={} R={}".format(i, mean_return)) + # Learn until good reward is reached in the actual env. + if mean_return > min_return_to_reach: + print("learnt!") + learnt = True + break + + if not learnt: + raise ValueError( + "`BC` did not reach {} reward from expert offline " + "data!".format(min_return_to_reach) + ) + + check_compute_single_action(algo, include_prev_action_reward=True) + + algo.stop() if __name__ == "__main__": diff --git a/rllib/algorithms/cql/tests/test_cql.py b/rllib/algorithms/cql/tests/test_cql.py index 5ed4f007f52a..60ce30a74f1c 100644 --- a/rllib/algorithms/cql/tests/test_cql.py +++ b/rllib/algorithms/cql/tests/test_cql.py @@ -1,23 +1,17 @@ -import numpy as np from pathlib import Path import os import unittest import ray from ray.rllib.algorithms import cql -from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, EVALUATION_RESULTS, ) -from ray.rllib.utils.test_utils import ( - check_compute_single_action, - check_train_results, - framework_iterator, -) +from ray.rllib.utils.test_utils import check_compute_single_action, check_train_results -tf1, tf, tfv = try_import_tf() torch, _ = try_import_torch() @@ -75,77 +69,51 @@ def test_cql_compilation(self): ) num_iterations = 4 - # Test for tf/torch frameworks. - for fw in framework_iterator(config): - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - eval_results = results.get(EVALUATION_RESULTS) - if eval_results: - print( - f"iter={algo.iteration} " - f"R={eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}" - ) - check_compute_single_action(algo) - - # Get policy and model. - pol = algo.get_policy() - cql_model = pol.model - if fw == "tf": - pol.get_session().__enter__() - - # Example on how to do evaluation on the trained Algorithm - # using the data from CQL's global replay buffer. - # Get a sample (MultiAgentBatch). - - batch = algo.env_runner.input_reader.next() - multi_agent_batch = batch.as_multi_agent() - # All experiences have been buffered for `default_policy` - batch = multi_agent_batch.policy_batches["default_policy"] - - if fw == "torch": - obs = torch.from_numpy(batch["obs"]) - else: - obs = batch["obs"] - batch["actions"] = batch["actions"].astype(np.float32) - - # Pass the observations through our model to get the - # features, which then to pass through the Q-head. 
- model_out, _ = cql_model({"obs": obs}) - # The estimated Q-values from the (historic) actions in the batch. - if fw == "torch": - q_values_old = cql_model.get_q_values( - model_out, torch.from_numpy(batch["actions"]) - ) - else: - q_values_old = cql_model.get_q_values( - tf.convert_to_tensor(model_out), batch["actions"] + algo = config.build() + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + eval_results = results.get(EVALUATION_RESULTS) + if eval_results: + print( + f"iter={algo.iteration} " + f"R={eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]}" ) + check_compute_single_action(algo) - # The estimated Q-values for the new actions computed - # by our policy. - actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0] - if fw == "torch": - q_values_new = cql_model.get_q_values( - model_out, torch.from_numpy(actions_new) - ) - else: - q_values_new = cql_model.get_q_values(model_out, actions_new) + # Get policy and model. + pol = algo.get_policy() + cql_model = pol.model - if fw == "tf": - q_values_old, q_values_new = pol.get_session().run( - [q_values_old, q_values_new] - ) + # Example on how to do evaluation on the trained Algorithm + # using the data from CQL's global replay buffer. + # Get a sample (MultiAgentBatch). + + batch = algo.env_runner.input_reader.next() + multi_agent_batch = batch.as_multi_agent() + # All experiences have been buffered for `default_policy` + batch = multi_agent_batch.policy_batches["default_policy"] + + obs = torch.from_numpy(batch["obs"]) + + # Pass the observations through our model to get the + # features, which then to pass through the Q-head. + model_out, _ = cql_model({"obs": obs}) + # The estimated Q-values from the (historic) actions in the batch. + q_values_old = cql_model.get_q_values( + model_out, torch.from_numpy(batch["actions"]) + ) - print(f"Q-val batch={q_values_old}") - print(f"Q-val policy={q_values_new}") + # The estimated Q-values for the new actions computed + # by our policy. + actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0] + q_values_new = cql_model.get_q_values(model_out, torch.from_numpy(actions_new)) - if fw == "tf": - pol.get_session().__exit__(None, None, None) + print(f"Q-val batch={q_values_old}") + print(f"Q-val policy={q_values_new}") - algo.stop() + algo.stop() if __name__ == "__main__": diff --git a/rllib/algorithms/dqn/tests/test_dqn.py b/rllib/algorithms/dqn/tests/test_dqn.py index b2472d24e03f..c442f731b816 100644 --- a/rllib/algorithms/dqn/tests/test_dqn.py +++ b/rllib/algorithms/dqn/tests/test_dqn.py @@ -8,7 +8,6 @@ check, check_compute_single_action, check_train_results, - framework_iterator, ) @@ -31,32 +30,31 @@ def test_dqn_compilation(self): .training(num_steps_sampled_before_learning_starts=0) ) - for _ in framework_iterator(config): - # Double-dueling DQN. - print("Double-dueling") - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - - check_compute_single_action(algo) - algo.stop() - - # Rainbow. - print("Rainbow") - rainbow_config = deepcopy(config).training( - num_atoms=10, noisy=True, double_q=True, dueling=True, n_step=5 - ) - algo = rainbow_config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - - check_compute_single_action(algo) - - algo.stop() + # Double-dueling DQN. 
+ print("Double-dueling") + algo = config.build() + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + + check_compute_single_action(algo) + algo.stop() + + # Rainbow. + print("Rainbow") + rainbow_config = deepcopy(config).training( + num_atoms=10, noisy=True, double_q=True, dueling=True, n_step=5 + ) + algo = rainbow_config.build() + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + + check_compute_single_action(algo) + + algo.stop() def test_dqn_compilation_integer_rewards(self): """Test whether DQN can be built on all frameworks. @@ -70,32 +68,31 @@ def test_dqn_compilation_integer_rewards(self): .training(num_steps_sampled_before_learning_starts=0) ) - for _ in framework_iterator(config): - # Double-dueling DQN. - print("Double-dueling") - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - - check_compute_single_action(algo) - algo.stop() - - # Rainbow. - print("Rainbow") - rainbow_config = deepcopy(config).training( - num_atoms=10, noisy=True, double_q=True, dueling=True, n_step=5 - ) - algo = rainbow_config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - - check_compute_single_action(algo) - - algo.stop() + # Double-dueling DQN. + print("Double-dueling") + algo = config.build() + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + + check_compute_single_action(algo) + algo.stop() + + # Rainbow. + print("Rainbow") + rainbow_config = deepcopy(config).training( + num_atoms=10, noisy=True, double_q=True, dueling=True, n_step=5 + ) + algo = rainbow_config.build() + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + + check_compute_single_action(algo) + + algo.stop() def test_dqn_exploration_and_soft_q_config(self): """Tests, whether a DQN Agent outputs exploration/softmaxed actions.""" @@ -108,62 +105,60 @@ def test_dqn_exploration_and_soft_q_config(self): obs = np.array(0) - # Test against all frameworks. - for _ in framework_iterator(config): - # Default EpsilonGreedy setup. - algo = config.build() - # Setting explore=False should always return the same action. - a_ = algo.compute_single_action(obs, explore=False) - for _ in range(50): - a = algo.compute_single_action(obs, explore=False) - check(a, a_) - # explore=None (default: explore) should return different actions. - actions = [] - for _ in range(50): - actions.append(algo.compute_single_action(obs)) - check(np.std(actions), 0.0, false=True) - algo.stop() - - # Low softmax temperature. Behaves like argmax - # (but no epsilon exploration). - config.env_runners( - exploration_config={"type": "SoftQ", "temperature": 0.000001} - ) - algo = config.build() - # Due to the low temp, always expect the same action. - actions = [algo.compute_single_action(obs)] - for _ in range(50): - actions.append(algo.compute_single_action(obs)) - check(np.std(actions), 0.0, decimals=3) - algo.stop() - - # Higher softmax temperature. - config.exploration_config["temperature"] = 1.0 - algo = config.build() - - # Even with the higher temperature, if we set explore=False, we - # should expect the same actions always. 
- a_ = algo.compute_single_action(obs, explore=False) - for _ in range(50): - a = algo.compute_single_action(obs, explore=False) - check(a, a_) - - # Due to the higher temp, expect different actions avg'ing - # around 1.5. - actions = [] - for _ in range(300): - actions.append(algo.compute_single_action(obs)) - check(np.std(actions), 0.0, false=True) - algo.stop() - - # With Random exploration. - config.env_runners(exploration_config={"type": "Random"}, explore=True) - algo = config.build() - actions = [] - for _ in range(300): - actions.append(algo.compute_single_action(obs)) - check(np.std(actions), 0.0, false=True) - algo.stop() + # Default EpsilonGreedy setup. + algo = config.build() + # Setting explore=False should always return the same action. + a_ = algo.compute_single_action(obs, explore=False) + for _ in range(50): + a = algo.compute_single_action(obs, explore=False) + check(a, a_) + # explore=None (default: explore) should return different actions. + actions = [] + for _ in range(50): + actions.append(algo.compute_single_action(obs)) + check(np.std(actions), 0.0, false=True) + algo.stop() + + # Low softmax temperature. Behaves like argmax + # (but no epsilon exploration). + config.env_runners( + exploration_config={"type": "SoftQ", "temperature": 0.000001} + ) + algo = config.build() + # Due to the low temp, always expect the same action. + actions = [algo.compute_single_action(obs)] + for _ in range(50): + actions.append(algo.compute_single_action(obs)) + check(np.std(actions), 0.0, decimals=3) + algo.stop() + + # Higher softmax temperature. + config.exploration_config["temperature"] = 1.0 + algo = config.build() + + # Even with the higher temperature, if we set explore=False, we + # should expect the same actions always. + a_ = algo.compute_single_action(obs, explore=False) + for _ in range(50): + a = algo.compute_single_action(obs, explore=False) + check(a, a_) + + # Due to the higher temp, expect different actions avg'ing + # around 1.5. + actions = [] + for _ in range(300): + actions.append(algo.compute_single_action(obs)) + check(np.std(actions), 0.0, false=True) + algo.stop() + + # With Random exploration. + config.env_runners(exploration_config={"type": "Random"}, explore=True) + algo = config.build() + actions = [] + for _ in range(300): + actions.append(algo.compute_single_action(obs)) + check(np.std(actions), 0.0, false=True) + algo.stop() if __name__ == "__main__": diff --git a/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py b/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py index f9919816ea13..7fbb8fd55c2a 100644 --- a/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py +++ b/rllib/algorithms/dreamerv3/tests/test_dreamerv3.py @@ -21,7 +21,6 @@ from ray.rllib.algorithms.dreamerv3 import dreamerv3 from ray.rllib.core import DEFAULT_MODULE_ID from ray.rllib.utils.numpy import one_hot -from ray.rllib.utils.test_utils import framework_iterator from ray import tune @@ -186,57 +185,54 @@ def test_dreamerv3_dreamer_model_sizes(self): symlog_obs=True, ) - for _ in framework_iterator(config, frameworks="tf2"): - # Check all model_sizes described in the paper ([1]) on matching the number - # of parameters to RLlib's implementation. - for model_size in ["XS", "S", "M", "L", "XL"]: - config.model_size = model_size - - # Atari and CartPole spaces. 
- for obs_space, num_actions, env_name in [ - (gym.spaces.Box(-1.0, 0.0, (4,), np.float32), 2, "cartpole"), - (gym.spaces.Box(-1.0, 0.0, (64, 64, 3), np.float32), 6, "atari"), - ]: - print(f"Testing model_size={model_size} on env-type: {env_name} ..") - config.environment( - observation_space=obs_space, - action_space=gym.spaces.Discrete(num_actions), - ) + # Check all model_sizes described in the paper ([1]) on matching the number + # of parameters to RLlib's implementation. + for model_size in ["XS", "S", "M", "L", "XL"]: + config.model_size = model_size + + # Atari and CartPole spaces. + for obs_space, num_actions, env_name in [ + (gym.spaces.Box(-1.0, 0.0, (4,), np.float32), 2, "cartpole"), + (gym.spaces.Box(-1.0, 0.0, (64, 64, 3), np.float32), 6, "atari"), + ]: + print(f"Testing model_size={model_size} on env-type: {env_name} ..") + config.environment( + observation_space=obs_space, + action_space=gym.spaces.Discrete(num_actions), + ) - # Create our RLModule to compute actions with. - policy_dict, _ = config.get_multi_agent_setup() - module_spec = config.get_multi_rl_module_spec( - policy_dict=policy_dict - ) - rl_module = module_spec.build()[DEFAULT_MODULE_ID] + # Create our RLModule to compute actions with. + policy_dict, _ = config.get_multi_agent_setup() + module_spec = config.get_multi_rl_module_spec(policy_dict=policy_dict) + rl_module = module_spec.build()[DEFAULT_MODULE_ID] - # Count the generated RLModule's parameters and compare to the - # paper's reported numbers ([1] and [3]). - num_params_world_model = sum( - np.prod(v.shape.as_list()) - for v in rl_module.world_model.trainable_variables - ) - self.assertEqual( - num_params_world_model, - expected_num_params_world_model[f"{model_size}_{env_name}"], - ) - num_params_actor = sum( - np.prod(v.shape.as_list()) - for v in rl_module.actor.trainable_variables - ) - self.assertEqual( - num_params_actor, - expected_num_params_actor[f"{model_size}_{env_name}"], - ) - num_params_critic = sum( - np.prod(v.shape.as_list()) - for v in rl_module.critic.trainable_variables - ) - self.assertEqual( - num_params_critic, - expected_num_params_critic[f"{model_size}_{env_name}"], - ) - print("\tok") + # Count the generated RLModule's parameters and compare to the + # paper's reported numbers ([1] and [3]). 
+ num_params_world_model = sum( + np.prod(v.shape.as_list()) + for v in rl_module.world_model.trainable_variables + ) + self.assertEqual( + num_params_world_model, + expected_num_params_world_model[f"{model_size}_{env_name}"], + ) + num_params_actor = sum( + np.prod(v.shape.as_list()) + for v in rl_module.actor.trainable_variables + ) + self.assertEqual( + num_params_actor, + expected_num_params_actor[f"{model_size}_{env_name}"], + ) + num_params_critic = sum( + np.prod(v.shape.as_list()) + for v in rl_module.critic.trainable_variables + ) + self.assertEqual( + num_params_critic, + expected_num_params_critic[f"{model_size}_{env_name}"], + ) + print("\tok") if __name__ == "__main__": diff --git a/rllib/algorithms/impala/tests/test_impala.py b/rllib/algorithms/impala/tests/test_impala.py index 4e30d9805afe..5f39f4bf5fe7 100644 --- a/rllib/algorithms/impala/tests/test_impala.py +++ b/rllib/algorithms/impala/tests/test_impala.py @@ -3,17 +3,13 @@ import ray import ray.rllib.algorithms.impala as impala from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY from ray.rllib.utils.test_utils import ( check, check_compute_single_action, check_train_results, - framework_iterator, ) -tf1, tf, tfv = try_import_tf() - class TestIMPALA(unittest.TestCase): @classmethod @@ -40,29 +36,28 @@ def test_impala_compilation(self): ) num_iterations = 2 - for _ in framework_iterator(config): - for lstm in [False, True]: - config.num_aggregation_workers = 0 if not lstm else 1 - config.model["use_lstm"] = lstm - print( - "lstm={} aggregation-workers={}".format( - lstm, config.num_aggregation_workers - ) + for lstm in [False, True]: + config.num_aggregation_workers = 0 if not lstm else 1 + config.model["use_lstm"] = lstm + print( + "lstm={} aggregation-workers={}".format( + lstm, config.num_aggregation_workers ) - # Test with and w/o aggregation workers (this has nothing - # to do with LSTMs, though). - algo = config.build() - for i in range(num_iterations): - results = algo.train() - print(results) - check_train_results(results) + ) + # Test with and w/o aggregation workers (this has nothing + # to do with LSTMs, though). + algo = config.build() + for i in range(num_iterations): + results = algo.train() + print(results) + check_train_results(results) - check_compute_single_action( - algo, - include_state=lstm, - include_prev_action_reward=lstm, - ) - algo.stop() + check_compute_single_action( + algo, + include_state=lstm, + include_prev_action_reward=lstm, + ) + algo.stop() def test_impala_lr_schedule(self): # Test whether we correctly ignore the "lr" setting. @@ -87,32 +82,28 @@ def get_lr(result): "cur_lr" ] - for fw in framework_iterator(config): - algo = config.build() - policy = algo.get_policy() + algo = config.build() + policy = algo.get_policy() - try: - if fw == "tf": - check(policy.get_session().run(policy.cur_lr), 0.05) - else: - check(policy.cur_lr, 0.05) - for _ in range(1): - r1 = algo.train() - for _ in range(2): - r2 = algo.train() - for _ in range(2): - r3 = algo.train() - # Due to the asynch'ness of IMPALA, learner-stats metrics - # could be delayed by one iteration. Do 3 train() calls here - # and measure guaranteed decrease in lr between 1st and 3rd. 
- lr1 = get_lr(r1) - lr2 = get_lr(r2) - lr3 = get_lr(r3) - assert lr2 <= lr1, (lr1, lr2) - assert lr3 <= lr2, (lr2, lr3) - assert lr3 < lr1, (lr1, lr3) - finally: - algo.stop() + try: + check(policy.cur_lr, 0.05) + for _ in range(1): + r1 = algo.train() + for _ in range(2): + r2 = algo.train() + for _ in range(2): + r3 = algo.train() + # Due to the asynch'ness of IMPALA, learner-stats metrics + # could be delayed by one iteration. Do 3 train() calls here + # and measure guaranteed decrease in lr between 1st and 3rd. + lr1 = get_lr(r1) + lr2 = get_lr(r2) + lr3 = get_lr(r3) + assert lr2 <= lr1, (lr1, lr2) + assert lr3 <= lr2, (lr2, lr3) + assert lr3 < lr1, (lr1, lr3) + finally: + algo.stop() if __name__ == "__main__": diff --git a/rllib/algorithms/impala/tests/test_vtrace.py b/rllib/algorithms/impala/tests/test_vtrace.py index 6c9a9998b711..a767ed61ca10 100644 --- a/rllib/algorithms/impala/tests/test_vtrace.py +++ b/rllib/algorithms/impala/tests/test_vtrace.py @@ -24,13 +24,11 @@ import numpy as np import unittest -from ray.rllib.algorithms.impala import vtrace_tf as vtrace_tf from ray.rllib.algorithms.impala import vtrace_torch as vtrace_torch -from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.numpy import softmax -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import check -tf1, tf, tfv = try_import_tf() torch, nn = try_import_torch() @@ -124,41 +122,33 @@ def test_log_probs_from_logits_and_actions(self): num_actions = 3 batch_size = 4 - for fw, sess in framework_iterator(frameworks=("torch", "tf"), session=True): - vtrace = vtrace_tf if fw != "torch" else vtrace_torch - policy_logits = Box( - -1.0, 1.0, (seq_len, batch_size, num_actions), np.float32 - ).sample() - actions = np.random.randint( - 0, num_actions - 1, size=(seq_len, batch_size), dtype=np.int32 - ) + vtrace = vtrace_torch + policy_logits = Box( + -1.0, 1.0, (seq_len, batch_size, num_actions), np.float32 + ).sample() + actions = np.random.randint( + 0, num_actions - 1, size=(seq_len, batch_size), dtype=np.int32 + ) - if fw == "torch": - action_log_probs_tensor = vtrace.log_probs_from_logits_and_actions( - torch.from_numpy(policy_logits), torch.from_numpy(actions) - ) - else: - action_log_probs_tensor = vtrace.log_probs_from_logits_and_actions( - policy_logits, actions - ) - - # Ground Truth - # Using broadcasting to create a mask that indexes action logits - action_index_mask = actions[..., None] == np.arange(num_actions) - - def index_with_mask(array, mask): - return array[mask].reshape(*array.shape[:-1]) - - # Note: Normally log(softmax) is not a good idea because it's not - # numerically stable. However, in this test we have well-behaved - # values. - ground_truth_v = index_with_mask( - np.log(softmax(policy_logits)), action_index_mask - ) + action_log_probs_tensor = vtrace.log_probs_from_logits_and_actions( + torch.from_numpy(policy_logits), torch.from_numpy(actions) + ) + + # Ground Truth + # Using broadcasting to create a mask that indexes action logits + action_index_mask = actions[..., None] == np.arange(num_actions) - if sess: - action_log_probs_tensor = sess.run(action_log_probs_tensor) - check(action_log_probs_tensor, ground_truth_v) + def index_with_mask(array, mask): + return array[mask].reshape(*array.shape[:-1]) + + # Note: Normally log(softmax) is not a good idea because it's not + # numerically stable. However, in this test we have well-behaved + # values. 
+ ground_truth_v = index_with_mask( + np.log(softmax(policy_logits)), action_index_mask + ) + + check(action_log_probs_tensor, ground_truth_v) class VtraceTest(unittest.TestCase): @@ -188,15 +178,12 @@ def test_vtrace(self): "clip_pg_rho_threshold": 2.2, } - for fw, sess in framework_iterator(frameworks=("torch", "tf"), session=True): - vtrace = vtrace_tf if fw != "torch" else vtrace_torch - output = vtrace.from_importance_weights(**values) - if sess: - output = sess.run(output) + vtrace = vtrace_torch + output = vtrace.from_importance_weights(**values) - gt_vs, gt_pg_advantags = _ground_truth_vtrace_calculation(**values) - check(output.vs, gt_vs) - check(output.pg_advantages, gt_pg_advantags) + gt_vs, gt_pg_advantags = _ground_truth_vtrace_calculation(**values) + check(output.vs, gt_vs) + check(output.pg_advantages, gt_pg_advantags) def test_vtrace_from_logits(self): """Tests V-trace calculated from logits.""" @@ -225,172 +212,77 @@ def test_vtrace_from_logits(self): ) space_only_batch = Box(-1.0, 1.0, (batch_size,)) - for fw, sess in framework_iterator(frameworks=("torch", "tf"), session=True): - vtrace = vtrace_tf if fw != "torch" else vtrace_torch - - if fw == "tf": - # Intentionally leaving shapes unspecified to test if V-trace - # can deal with that. - inputs_ = { - # T, B, NUM_ACTIONS - "behaviour_policy_logits": tf1.placeholder( - dtype=tf.float32, shape=[None, None, None] - ), - # T, B, NUM_ACTIONS - "target_policy_logits": tf1.placeholder( - dtype=tf.float32, shape=[None, None, None] - ), - "actions": tf1.placeholder(dtype=tf.int32, shape=[None, None]), - "discounts": tf1.placeholder(dtype=tf.float32, shape=[None, None]), - "rewards": tf1.placeholder(dtype=tf.float32, shape=[None, None]), - "values": tf1.placeholder(dtype=tf.float32, shape=[None, None]), - "bootstrap_value": tf1.placeholder(dtype=tf.float32, shape=[None]), - } - else: - inputs_ = { - # T, B, NUM_ACTIONS - "behaviour_policy_logits": space.sample(), - # T, B, NUM_ACTIONS - "target_policy_logits": space.sample(), - "actions": action_space.sample(), - "discounts": space_w_time.sample(), - "rewards": space_w_time.sample(), - "values": space_w_time.sample(), - "bootstrap_value": space_only_batch.sample(), - } - from_logits_output = vtrace.from_logits( - clip_rho_threshold=clip_rho_threshold, - clip_pg_rho_threshold=clip_pg_rho_threshold, - **inputs_ - ) + inputs_ = { + # T, B, NUM_ACTIONS + "behaviour_policy_logits": space.sample(), + # T, B, NUM_ACTIONS + "target_policy_logits": space.sample(), + "actions": action_space.sample(), + "discounts": space_w_time.sample(), + "rewards": space_w_time.sample(), + "values": space_w_time.sample(), + "bootstrap_value": space_only_batch.sample(), + } + from_logits_output = vtrace_torch.from_logits( + clip_rho_threshold=clip_rho_threshold, + clip_pg_rho_threshold=clip_pg_rho_threshold, + **inputs_ + ) - if fw != "torch": - target_log_probs = vtrace.log_probs_from_logits_and_actions( - inputs_["target_policy_logits"], inputs_["actions"] - ) - behaviour_log_probs = vtrace.log_probs_from_logits_and_actions( - inputs_["behaviour_policy_logits"], inputs_["actions"] - ) - else: - target_log_probs = vtrace.log_probs_from_logits_and_actions( - torch.from_numpy(inputs_["target_policy_logits"]), - torch.from_numpy(inputs_["actions"]), - ) - behaviour_log_probs = vtrace.log_probs_from_logits_and_actions( - torch.from_numpy(inputs_["behaviour_policy_logits"]), - torch.from_numpy(inputs_["actions"]), - ) - log_rhos = target_log_probs - behaviour_log_probs - ground_truth = (log_rhos, 
behaviour_log_probs, target_log_probs) - - if sess: - values = { - "behaviour_policy_logits": space.sample(), - "target_policy_logits": space.sample(), - "actions": action_space.sample(), - "discounts": space_w_time.sample(), - "rewards": space_w_time.sample(), - "values": space_w_time.sample() / batch_size, - "bootstrap_value": space_only_batch.sample() + 1.0, - } - feed_dict = {inputs_[k]: v for k, v in values.items()} - from_logits_output = sess.run(from_logits_output, feed_dict=feed_dict) - log_rhos, behaviour_log_probs, target_log_probs = sess.run( - ground_truth, feed_dict=feed_dict - ) - - # Calculate V-trace using the ground truth logits. - from_iw = vtrace.from_importance_weights( - log_rhos=log_rhos, - discounts=values["discounts"], - rewards=values["rewards"], - values=values["values"], - bootstrap_value=values["bootstrap_value"], - clip_rho_threshold=clip_rho_threshold, - clip_pg_rho_threshold=clip_pg_rho_threshold, - ) - from_iw = sess.run(from_iw) - else: - from_iw = vtrace.from_importance_weights( - log_rhos=log_rhos, - discounts=inputs_["discounts"], - rewards=inputs_["rewards"], - values=inputs_["values"], - bootstrap_value=inputs_["bootstrap_value"], - clip_rho_threshold=clip_rho_threshold, - clip_pg_rho_threshold=clip_pg_rho_threshold, - ) - - check(from_iw.vs, from_logits_output.vs) - check(from_iw.pg_advantages, from_logits_output.pg_advantages) - check(behaviour_log_probs, from_logits_output.behaviour_action_log_probs) - check(target_log_probs, from_logits_output.target_action_log_probs) - check(log_rhos, from_logits_output.log_rhos) + target_log_probs = vtrace_torch.log_probs_from_logits_and_actions( + torch.from_numpy(inputs_["target_policy_logits"]), + torch.from_numpy(inputs_["actions"]), + ) + behaviour_log_probs = vtrace_torch.log_probs_from_logits_and_actions( + torch.from_numpy(inputs_["behaviour_policy_logits"]), + torch.from_numpy(inputs_["actions"]), + ) + log_rhos = target_log_probs - behaviour_log_probs + + from_iw = vtrace_torch.from_importance_weights( + log_rhos=log_rhos, + discounts=inputs_["discounts"], + rewards=inputs_["rewards"], + values=inputs_["values"], + bootstrap_value=inputs_["bootstrap_value"], + clip_rho_threshold=clip_rho_threshold, + clip_pg_rho_threshold=clip_pg_rho_threshold, + ) + + check(from_iw.vs, from_logits_output.vs) + check(from_iw.pg_advantages, from_logits_output.pg_advantages) + check(behaviour_log_probs, from_logits_output.behaviour_action_log_probs) + check(target_log_probs, from_logits_output.target_action_log_probs) + check(log_rhos, from_logits_output.log_rhos) def test_higher_rank_inputs_for_importance_weights(self): """Checks support for additional dimensions in inputs.""" - for fw in framework_iterator(frameworks=("torch", "tf"), session=True): - vtrace = vtrace_tf if fw != "torch" else vtrace_torch - if fw == "tf": - inputs_ = { - "log_rhos": tf1.placeholder( - dtype=tf.float32, shape=[None, None, 1] - ), - "discounts": tf1.placeholder( - dtype=tf.float32, shape=[None, None, 1] - ), - "rewards": tf1.placeholder( - dtype=tf.float32, shape=[None, None, 42] - ), - "values": tf1.placeholder(dtype=tf.float32, shape=[None, None, 42]), - "bootstrap_value": tf1.placeholder( - dtype=tf.float32, shape=[None, 42] - ), - } - else: - inputs_ = { - "log_rhos": Box(-1.0, 1.0, (8, 10, 1)).sample(), - "discounts": Box(-1.0, 1.0, (8, 10, 1)).sample(), - "rewards": Box(-1.0, 1.0, (8, 10, 42)).sample(), - "values": Box(-1.0, 1.0, (8, 10, 42)).sample(), - "bootstrap_value": Box(-1.0, 1.0, (10, 42)).sample(), - } - output = 
vtrace.from_importance_weights(**inputs_) - check(int(output.vs.shape[-1]), 42) + inputs_ = { + "log_rhos": Box(-1.0, 1.0, (8, 10, 1)).sample(), + "discounts": Box(-1.0, 1.0, (8, 10, 1)).sample(), + "rewards": Box(-1.0, 1.0, (8, 10, 42)).sample(), + "values": Box(-1.0, 1.0, (8, 10, 42)).sample(), + "bootstrap_value": Box(-1.0, 1.0, (10, 42)).sample(), + } + output = vtrace_torch.from_importance_weights(**inputs_) + check(int(output.vs.shape[-1]), 42) def test_inconsistent_rank_inputs_for_importance_weights(self): """Test one of many possible errors in shape of inputs.""" - for fw in framework_iterator(frameworks=("torch", "tf"), session=True): - vtrace = vtrace_tf if fw != "torch" else vtrace_torch - if fw == "tf": - inputs_ = { - "log_rhos": tf1.placeholder( - dtype=tf.float32, shape=[None, None, 1] - ), - "discounts": tf1.placeholder( - dtype=tf.float32, shape=[None, None, 1] - ), - "rewards": tf1.placeholder( - dtype=tf.float32, shape=[None, None, 42] - ), - "values": tf1.placeholder(dtype=tf.float32, shape=[None, None, 42]), - # Should be [None, 42]. - "bootstrap_value": tf1.placeholder(dtype=tf.float32, shape=[None]), - } - else: - inputs_ = { - "log_rhos": Box(-1.0, 1.0, (7, 15, 1)).sample(), - "discounts": Box(-1.0, 1.0, (7, 15, 1)).sample(), - "rewards": Box(-1.0, 1.0, (7, 15, 42)).sample(), - "values": Box(-1.0, 1.0, (7, 15, 42)).sample(), - # Should be [15, 42]. - "bootstrap_value": Box(-1.0, 1.0, (7,)).sample(), - } - with self.assertRaisesRegex( - (ValueError, AssertionError), "must have rank 2" - ): - vtrace.from_importance_weights(**inputs_) + inputs_ = { + "log_rhos": Box(-1.0, 1.0, (7, 15, 1)).sample(), + "discounts": Box(-1.0, 1.0, (7, 15, 1)).sample(), + "rewards": Box(-1.0, 1.0, (7, 15, 42)).sample(), + "values": Box(-1.0, 1.0, (7, 15, 42)).sample(), + # Should be [15, 42]. + "bootstrap_value": Box(-1.0, 1.0, (7,)).sample(), + } + with self.assertRaisesRegex((ValueError, AssertionError), "must have rank 2"): + vtrace_torch.from_importance_weights(**inputs_) if __name__ == "__main__": - tf.test.main() + import pytest + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/algorithms/marwil/tests/test_marwil.py b/rllib/algorithms/marwil/tests/test_marwil.py index 4fb3fb1da191..703674d7cd60 100644 --- a/rllib/algorithms/marwil/tests/test_marwil.py +++ b/rllib/algorithms/marwil/tests/test_marwil.py @@ -71,8 +71,6 @@ def test_marwil_compilation_and_learning_from_offline_file(self): num_iterations = 350 min_reward = 100.0 - # Test for all frameworks. - algo = config.build() learnt = False for i in range(num_iterations): diff --git a/rllib/algorithms/marwil/tests/test_marwil_old_api_stack.py b/rllib/algorithms/marwil/tests/test_marwil_old_api_stack.py index b43c5b0abd5f..bffcbe06db5f 100644 --- a/rllib/algorithms/marwil/tests/test_marwil_old_api_stack.py +++ b/rllib/algorithms/marwil/tests/test_marwil_old_api_stack.py @@ -5,7 +5,6 @@ import ray import ray.rllib.algorithms.marwil as marwil -from ray.rllib.algorithms.marwil.marwil_tf_policy import MARWILTF2Policy from ray.rllib.algorithms.marwil.marwil_torch_policy import MARWILTorchPolicy from ray.rllib.evaluation.postprocessing import compute_advantages from ray.rllib.offline import JsonReader @@ -19,7 +18,6 @@ check, check_compute_single_action, check_train_results, - framework_iterator, ) tf1, tf, tfv = try_import_tf() @@ -67,40 +65,35 @@ def test_marwil_compilation_and_learning_from_offline_file(self): num_iterations = 350 min_reward = 100.0 - # Test for all frameworks. 
- for _ in framework_iterator(config, frameworks=("torch", "tf")): - algo = config.build() - learnt = False - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - - eval_results = results.get(EVALUATION_RESULTS) - if eval_results: - print( - "iter={} R={} ".format( - i, eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] - ) + algo = config.build() + learnt = False + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + + eval_results = results.get(EVALUATION_RESULTS) + if eval_results: + print( + "iter={} R={} ".format( + i, eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] ) - # Learn until some reward is reached on an actual live env. - if ( - eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] - > min_reward - ): - print("learnt!") - learnt = True - break - - if not learnt: - raise ValueError( - "MARWILAlgorithm did not reach {} reward from expert " - "offline data!".format(min_reward) ) + # Learn until some reward is reached on an actual live env. + if eval_results[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] > min_reward: + print("learnt!") + learnt = True + break + + if not learnt: + raise ValueError( + "MARWILAlgorithm did not reach {} reward from expert " + "offline data!".format(min_reward) + ) - check_compute_single_action(algo, include_prev_action_reward=True) + check_compute_single_action(algo, include_prev_action_reward=True) - algo.stop() + algo.stop() def test_marwil_cont_actions_from_offline_file(self): """Test whether MARWIL runs with cont. actions. @@ -136,12 +129,10 @@ def test_marwil_cont_actions_from_offline_file(self): num_iterations = 3 - # Test for all frameworks. - for _ in framework_iterator(config, frameworks=("torch", "tf")): - algo = config.build(env="Pendulum-v1") - for i in range(num_iterations): - print(algo.train()) - algo.stop() + algo = config.build(env="Pendulum-v1") + for i in range(num_iterations): + print(algo.train()) + algo.stop() def test_marwil_loss_function(self): """ @@ -161,84 +152,48 @@ def test_marwil_loss_function(self): .offline_data(input_=[data_file]) ) # Learn from offline data. - for fw, sess in framework_iterator(config, session=True): - reader = JsonReader(inputs=[data_file]) - batch = reader.next() - - algo = config.build(env="CartPole-v1") - policy = algo.get_policy() - model = policy.model - - # Calculate our own expected values (to then compare against the - # agent's loss output). - cummulative_rewards = compute_advantages( - batch, 0.0, config.gamma, 1.0, False, False - )["advantages"] - if fw == "torch": - cummulative_rewards = torch.tensor(cummulative_rewards) - if fw != "tf": - batch = policy._lazy_tensor_dict(batch) - model_out, _ = model(batch) - vf_estimates = model.value_function() - if fw == "tf": - model_out, vf_estimates = policy.get_session().run( - [model_out, vf_estimates] - ) - adv = cummulative_rewards - vf_estimates - if fw == "torch": - adv = adv.detach().cpu().numpy() - adv_squared = np.mean(np.square(adv)) - c_2 = 100.0 + 1e-8 * (adv_squared - 100.0) - c = np.sqrt(c_2) - exp_advs = np.exp(config.beta * (adv / c)) - dist = policy.dist_class(model_out, model) - logp = dist.logp(batch["actions"]) - if fw == "torch": - logp = logp.detach().cpu().numpy() - elif fw == "tf": - logp = sess.run(logp) - # Calculate all expected loss components. 
- expected_vf_loss = 0.5 * adv_squared - expected_pol_loss = -1.0 * np.mean(exp_advs * logp) - expected_loss = expected_pol_loss + config.vf_coeff * expected_vf_loss - - # Calculate the algorithm's loss (to check against our own - # calculation above). - batch.set_get_interceptor(None) - postprocessed_batch = policy.postprocess_trajectory(batch) - loss_func = ( - MARWILTF2Policy.loss if fw != "torch" else MARWILTorchPolicy.loss - ) - if fw != "tf": - policy._lazy_tensor_dict(postprocessed_batch) - loss_out = loss_func( - policy, model, policy.dist_class, postprocessed_batch - ) - else: - loss_out, v_loss, p_loss = policy.get_session().run( - # policy._loss is create by TFPolicy, and is basically the - # loss tensor of the static graph. - [ - policy._loss, - policy._marwil_loss.v_loss, - policy._marwil_loss.p_loss, - ], - feed_dict=policy._get_loss_inputs_dict( - postprocessed_batch, shuffle=False - ), - ) - - # Check all components. - if fw == "torch": - check(policy.v_loss, expected_vf_loss, decimals=4) - check(policy.p_loss, expected_pol_loss, decimals=4) - elif fw == "tf": - check(v_loss, expected_vf_loss, decimals=4) - check(p_loss, expected_pol_loss, decimals=4) - else: - check(policy._marwil_loss.v_loss, expected_vf_loss, decimals=4) - check(policy._marwil_loss.p_loss, expected_pol_loss, decimals=4) - check(loss_out, expected_loss, decimals=3) + reader = JsonReader(inputs=[data_file]) + batch = reader.next() + + algo = config.build(env="CartPole-v1") + policy = algo.get_policy() + model = policy.model + + # Calculate our own expected values (to then compare against the + # agent's loss output). + cummulative_rewards = compute_advantages( + batch, 0.0, config.gamma, 1.0, False, False + )["advantages"] + cummulative_rewards = torch.tensor(cummulative_rewards) + batch = policy._lazy_tensor_dict(batch) + model_out, _ = model(batch) + vf_estimates = model.value_function() + adv = cummulative_rewards - vf_estimates + adv = adv.detach().cpu().numpy() + adv_squared = np.mean(np.square(adv)) + c_2 = 100.0 + 1e-8 * (adv_squared - 100.0) + c = np.sqrt(c_2) + exp_advs = np.exp(config.beta * (adv / c)) + dist = policy.dist_class(model_out, model) + logp = dist.logp(batch["actions"]) + logp = logp.detach().cpu().numpy() + # Calculate all expected loss components. + expected_vf_loss = 0.5 * adv_squared + expected_pol_loss = -1.0 * np.mean(exp_advs * logp) + expected_loss = expected_pol_loss + config.vf_coeff * expected_vf_loss + + # Calculate the algorithm's loss (to check against our own + # calculation above). + batch.set_get_interceptor(None) + postprocessed_batch = policy.postprocess_trajectory(batch) + loss_func = MARWILTorchPolicy.loss + policy._lazy_tensor_dict(postprocessed_batch) + loss_out = loss_func(policy, model, policy.dist_class, postprocessed_batch) + + # Check all components. 
+ check(policy.v_loss, expected_vf_loss, decimals=4) + check(policy.p_loss, expected_pol_loss, decimals=4) + check(loss_out, expected_loss, decimals=3) if __name__ == "__main__": diff --git a/rllib/algorithms/ppo/tests/test_ppo.py b/rllib/algorithms/ppo/tests/test_ppo.py index 61c02521d24f..c99bc9c8feac 100644 --- a/rllib/algorithms/ppo/tests/test_ppo.py +++ b/rllib/algorithms/ppo/tests/test_ppo.py @@ -5,14 +5,12 @@ import ray from ray.rllib.algorithms.callbacks import DefaultCallbacks import ray.rllib.algorithms.ppo as ppo -from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF2Policy from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy from ray.rllib.core.columns import Columns from ray.rllib.evaluation.postprocessing import ( compute_gae_for_sample_batch, Postprocessing, ) -from ray.rllib.models.tf.tf_action_dist import Categorical from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from ray.rllib.models.torch.torch_action_dist import TorchCategorical from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch @@ -23,7 +21,6 @@ check_compute_single_action, check_off_policyness, check_train_results, - framework_iterator, check_inference_w_connectors, ) @@ -159,39 +156,34 @@ def test_ppo_compilation_w_connectors(self): num_iterations = 2 - for fw in framework_iterator(config): - for env in ["FrozenLake-v1", "ALE/MsPacman-v5"]: - print("Env={}".format(env)) - for lstm in [False, True]: - print("LSTM={}".format(lstm)) - config.training( - model=dict( - use_lstm=lstm, - lstm_use_prev_action=lstm, - lstm_use_prev_reward=lstm, - ) + for env in ["FrozenLake-v1", "ALE/MsPacman-v5"]: + print("Env={}".format(env)) + for lstm in [False, True]: + print("LSTM={}".format(lstm)) + config.training( + model=dict( + use_lstm=lstm, + lstm_use_prev_action=lstm, + lstm_use_prev_reward=lstm, ) + ) - algo = config.build(env=env) - policy = algo.get_policy() - entropy_coeff = algo.get_policy().entropy_coeff - lr = policy.cur_lr - if fw == "tf": - entropy_coeff, lr = policy.get_session().run( - [entropy_coeff, lr] - ) - check(entropy_coeff, 0.1) - check(lr, config.lr) + algo = config.build(env=env) + policy = algo.get_policy() + entropy_coeff = algo.get_policy().entropy_coeff + lr = policy.cur_lr + check(entropy_coeff, 0.1) + check(lr, config.lr) - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) - algo.evaluate() + algo.evaluate() - check_inference_w_connectors(policy, env_name=env) - algo.stop() + check_inference_w_connectors(policy, env_name=env) + algo.stop() def test_ppo_compilation_and_schedule_mixins(self): """Test whether PPO can be built with all frameworks.""" @@ -225,45 +217,40 @@ def test_ppo_compilation_and_schedule_mixins(self): num_iterations = 2 - for fw in framework_iterator(config): - for env in ["FrozenLake-v1", "ALE/MsPacman-v5"]: - print("Env={}".format(env)) - for lstm in [False, True]: - print("LSTM={}".format(lstm)) - config.training( - model=dict( - use_lstm=lstm, - lstm_use_prev_action=lstm, - lstm_use_prev_reward=lstm, - ) + for env in ["FrozenLake-v1", "ALE/MsPacman-v5"]: + print("Env={}".format(env)) + for lstm in [False, True]: + print("LSTM={}".format(lstm)) + config.training( + model=dict( + use_lstm=lstm, + lstm_use_prev_action=lstm, + lstm_use_prev_reward=lstm, ) + ) - algo = config.build(env=env) - policy = algo.get_policy() - entropy_coeff = algo.get_policy().entropy_coeff - 
lr = policy.cur_lr - if fw == "tf": - entropy_coeff, lr = policy.get_session().run( - [entropy_coeff, lr] - ) - check(entropy_coeff, 0.1) - check(lr, config.lr) - - for i in range(num_iterations): - results = algo.train() - print(results) - check_train_results(results) - # 2 sgd iters per update, 2 minibatches per trainbatch -> 4x - # avg(0.0, 1.0, 2.0, 3.0) -> 1.5 - off_policy_ness = check_off_policyness( - results, lower_limit=1.5, upper_limit=1.5 - ) - print(f"off-policy'ness={off_policy_ness}") - - check_compute_single_action( - algo, include_prev_action_reward=True, include_state=lstm + algo = config.build(env=env) + policy = algo.get_policy() + entropy_coeff = algo.get_policy().entropy_coeff + lr = policy.cur_lr + check(entropy_coeff, 0.1) + check(lr, config.lr) + + for i in range(num_iterations): + results = algo.train() + print(results) + check_train_results(results) + # 2 sgd iters per update, 2 minibatches per trainbatch -> 4x + # avg(0.0, 1.0, 2.0, 3.0) -> 1.5 + off_policy_ness = check_off_policyness( + results, lower_limit=1.5, upper_limit=1.5 ) - algo.stop() + print(f"off-policy'ness={off_policy_ness}") + + check_compute_single_action( + algo, include_prev_action_reward=True, include_state=lstm + ) + algo.stop() def test_ppo_exploration_setup(self): """Tests, whether PPO runs with different exploration setups.""" @@ -280,34 +267,32 @@ def test_ppo_exploration_setup(self): ) obs = np.array(0) - # Test against all frameworks. - for fw, sess in framework_iterator(config, session=True): - # Default Agent should be setup with StochasticSampling. - algo = config.build() - # explore=False, always expect the same (deterministic) action. - a_ = algo.compute_single_action( - obs, explore=False, prev_action=np.array(2), prev_reward=np.array(1.0) - ) + # Default Agent should be setup with StochasticSampling. + algo = config.build() + # explore=False, always expect the same (deterministic) action. + a_ = algo.compute_single_action( + obs, explore=False, prev_action=np.array(2), prev_reward=np.array(1.0) + ) - for _ in range(50): - a = algo.compute_single_action( - obs, - explore=False, - prev_action=np.array(2), - prev_reward=np.array(1.0), - ) - check(a, a_) - - # With explore=True (default), expect stochastic actions. - actions = [] - for _ in range(300): - actions.append( - algo.compute_single_action( - obs, prev_action=np.array(2), prev_reward=np.array(1.0) - ) + for _ in range(50): + a = algo.compute_single_action( + obs, + explore=False, + prev_action=np.array(2), + prev_reward=np.array(1.0), + ) + check(a, a_) + + # With explore=True (default), expect stochastic actions. + actions = [] + for _ in range(300): + actions.append( + algo.compute_single_action( + obs, prev_action=np.array(2), prev_reward=np.array(1.0) ) - check(np.mean(actions), 1.5, atol=0.2) - algo.stop() + ) + check(np.mean(actions), 1.5, atol=0.2) + algo.stop() def test_ppo_free_log_std(self): """Tests the free log std option works. @@ -334,45 +319,31 @@ def test_ppo_free_log_std(self): ) ) - for fw, sess in framework_iterator(config, session=True): - algo = config.build() - policy = algo.get_policy() - - # Check the free log std var is created. - if fw == "torch": - matching = [ - v for (n, v) in policy.model.named_parameters() if "log_std" in n - ] - else: - matching = [ - v for v in policy.model.trainable_variables() if "log_std" in str(v) - ] - assert len(matching) == 1, matching - log_std_var = matching[0] - - # linter yells at you if you don't pass in the parameters. 
- # reason: https://docs.python-guide.org/writing/gotchas/ - # #late-binding-closures - def get_value(fw=fw, policy=policy, log_std_var=log_std_var): - if fw == "tf": - return policy.get_session().run(log_std_var)[0] - elif fw == "torch": - return log_std_var.detach().cpu().numpy()[0] - else: - return log_std_var.numpy()[0] - - # Check the variable is initially zero. - init_std = get_value() - assert init_std == 0.0, init_std - batch = compute_gae_for_sample_batch(policy, CARTPOLE_FAKE_BATCH.copy()) - if fw == "torch": - batch = policy._lazy_tensor_dict(batch) - policy.learn_on_batch(batch) - - # Check the variable is updated. - post_std = get_value() - assert post_std != 0.0, post_std - algo.stop() + algo = config.build() + policy = algo.get_policy() + + # Check the free log std var is created. + matching = [v for (n, v) in policy.model.named_parameters() if "log_std" in n] + assert len(matching) == 1, matching + log_std_var = matching[0] + + # linter yells at you if you don't pass in the parameters. + # reason: https://docs.python-guide.org/writing/gotchas/ + # #late-binding-closures + def get_value(fw="torch", policy=policy, log_std_var=log_std_var): + return log_std_var.detach().cpu().numpy()[0] + + # Check the variable is initially zero. + init_std = get_value() + assert init_std == 0.0, init_std + batch = compute_gae_for_sample_batch(policy, CARTPOLE_FAKE_BATCH.copy()) + batch = policy._lazy_tensor_dict(batch) + policy.learn_on_batch(batch) + + # Check the variable is updated. + post_std = get_value() + assert post_std != 0.0, post_std + algo.stop() def test_ppo_loss_function(self): """Tests the PPO loss function math. @@ -397,108 +368,62 @@ def test_ppo_loss_function(self): ) ) - for fw, sess in framework_iterator(config, session=True): - algo = config.build() - policy = algo.get_policy() - - # Check no free log std var by default. - if fw == "torch": - matching = [ - v for (n, v) in policy.model.named_parameters() if "log_std" in n - ] - else: - matching = [ - v for v in policy.model.trainable_variables() if "log_std" in str(v) - ] - assert len(matching) == 0, matching - - # Post-process (calculate simple (non-GAE) advantages) and attach - # to train_batch dict. - # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] = - # [0.50005, -0.505, 0.5] - train_batch = compute_gae_for_sample_batch( - policy, CARTPOLE_FAKE_BATCH.copy() - ) - if fw == "torch": - train_batch = policy._lazy_tensor_dict(train_batch) - - # Check Advantage values. - check(train_batch[Postprocessing.VALUE_TARGETS], [0.50005, -0.505, 0.5]) - - # Calculate actual PPO loss. - if fw == "tf2": - PPOTF2Policy.loss(policy, policy.model, Categorical, train_batch) - elif fw == "torch": - PPOTorchPolicy.loss( - policy, policy.model, policy.dist_class, train_batch - ) + algo = config.build() + policy = algo.get_policy() - vars = ( - policy.model.variables() - if fw != "torch" - else list(policy.model.parameters()) - ) - if fw == "tf": - vars = policy.get_session().run(vars) - expected_shared_out = fc( - train_batch[Columns.OBS], - vars[0 if fw != "torch" else 2], - vars[1 if fw != "torch" else 3], - framework=fw, - ) - expected_logits = fc( - expected_shared_out, - vars[2 if fw != "torch" else 0], - vars[3 if fw != "torch" else 1], - framework=fw, - ) - expected_value_outs = fc( - expected_shared_out, vars[4], vars[5], framework=fw - ) + # Check no free log std var by default. 
+        matching = [v for (n, v) in policy.model.named_parameters() if "log_std" in n]
+        assert len(matching) == 0, matching
-
+        # Post-process (calculate simple (non-GAE) advantages) and attach
+        # to train_batch dict.
+        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
+        # [0.50005, -0.505, 0.5]
+        train_batch = compute_gae_for_sample_batch(policy, CARTPOLE_FAKE_BATCH.copy())
+        train_batch = policy._lazy_tensor_dict(train_batch)
+
+        # Check Advantage values.
+        check(train_batch[Postprocessing.VALUE_TARGETS], [0.50005, -0.505, 0.5])
+
+        # Calculate actual PPO loss.
+ PPOTorchPolicy.loss(policy, policy.model, policy.dist_class, train_batch) + + vars = list(policy.model.parameters()) + expected_shared_out = fc( + train_batch[Columns.OBS], + vars[2], + vars[3], + framework="torch", + ) + expected_logits = fc( + expected_shared_out, + vars[0], + vars[1], + framework="torch", + ) + expected_value_outs = fc( + expected_shared_out, vars[4], vars[5], framework="torch" + ) + + kl, entropy, pg_loss, vf_loss, overall_loss = self._ppo_loss_helper( + policy, + policy.model, + TorchCategorical, + train_batch, + expected_logits, + expected_value_outs, + sess=None, + ) + check(policy.model.tower_stats["mean_kl_loss"], kl) + check(policy.model.tower_stats["mean_entropy"], entropy) + check(policy.model.tower_stats["mean_policy_loss"], np.mean(-pg_loss)) + check( + policy.model.tower_stats["mean_vf_loss"], + np.mean(vf_loss), + decimals=4, + ) + check(policy.model.tower_stats["total_loss"], overall_loss, decimals=4) + algo.stop() def _ppo_loss_helper( self, policy, model, dist_class, train_batch, logits, vf_outs, sess=None diff --git a/rllib/algorithms/ppo/tests/test_ppo_learner.py b/rllib/algorithms/ppo/tests/test_ppo_learner.py index 28242452b1ed..809875fcfb49 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_learner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_learner.py @@ -3,7 +3,6 @@ import gymnasium as gym import numpy as np -import tensorflow as tf import torch import tree # pip install dm-tree @@ -15,7 +14,7 @@ from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.metrics.learner_info import LEARNER_INFO -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import check from ray.tune.registry import register_env @@ -69,34 +68,28 @@ def test_loss(self): ) ) - for fw in framework_iterator(config, ("tf2", "torch")): - algo = config.build() - policy = algo.get_policy() + algo = config.build() + policy = algo.get_policy() - train_batch = SampleBatch(FAKE_BATCH) - train_batch = compute_gae_for_sample_batch(policy, train_batch) + train_batch = SampleBatch(FAKE_BATCH) + train_batch = compute_gae_for_sample_batch(policy, train_batch) - # convert to proper tensors with tree.map_structure - if fw == "torch": - train_batch = tree.map_structure( - lambda x: torch.as_tensor(x).float(), train_batch - ) - else: - train_batch = tree.map_structure( - lambda x: tf.convert_to_tensor(x), train_batch - ) + # Convert to proper tensors with tree.map_structure. + train_batch = tree.map_structure( + lambda x: torch.as_tensor(x).float(), train_batch + ) - algo_config = config.copy(copy_frozen=False) - algo_config.validate() - algo_config.freeze() + algo_config = config.copy(copy_frozen=False) + algo_config.validate() + algo_config.freeze() - learner_group = algo_config.build_learner_group(env=self.ENV) + learner_group = algo_config.build_learner_group(env=self.ENV) - # Load the algo weights onto the learner_group. - learner_group.set_weights(algo.get_weights()) - learner_group.update_from_batch(batch=train_batch.as_multi_agent()) + # Load the algo weights onto the learner_group. 
+ learner_group.set_weights(algo.get_weights()) + learner_group.update_from_batch(batch=train_batch.as_multi_agent()) - algo.stop() + algo.stop() def test_save_to_path_and_restore_from_path(self): """Tests saving and loading the state of the PPO Learner Group.""" @@ -117,19 +110,18 @@ def test_save_to_path_and_restore_from_path(self): ) ) - for _ in framework_iterator(config, ("tf2", "torch")): - algo_config = config.copy(copy_frozen=False) - algo_config.validate() - algo_config.freeze() - learner_group1 = algo_config.build_learner_group(env=self.ENV) - learner_group2 = algo_config.build_learner_group(env=self.ENV) - with tempfile.TemporaryDirectory() as tmpdir: - learner_group1.save_to_path(tmpdir) - learner_group2.restore_from_path(tmpdir) - # Remove functions from state b/c they are not comparable via `check`. - s1 = learner_group1.get_state() - s2 = learner_group2.get_state() - check(s1, s2) + algo_config = config.copy(copy_frozen=False) + algo_config.validate() + algo_config.freeze() + learner_group1 = algo_config.build_learner_group(env=self.ENV) + learner_group2 = algo_config.build_learner_group(env=self.ENV) + with tempfile.TemporaryDirectory() as tmpdir: + learner_group1.save_to_path(tmpdir) + learner_group2.restore_from_path(tmpdir) + # Remove functions from state b/c they are not comparable via `check`. + s1 = learner_group1.get_state() + s2 = learner_group2.get_state() + check(s1, s2) def test_kl_coeff_changes(self): # Simple environment with 4 independent cartpole entities @@ -165,29 +157,28 @@ def test_kl_coeff_changes(self): ) ) - for _ in framework_iterator(config, ("torch", "tf2")): - algo = config.build() - # Call train while results aren't returned because this is - # a asynchronous Algorithm and results are returned asynchronously. - curr_kl_coeff_1 = None - curr_kl_coeff_2 = None - while not curr_kl_coeff_1 or not curr_kl_coeff_2: - results = algo.train() - - # Attempt to get the current KL coefficient from the learner. - # Iterate until we have found both coefficients at least once. - if results and "info" in results and LEARNER_INFO in results["info"]: - if "p0" in results["info"][LEARNER_INFO]: - curr_kl_coeff_1 = results["info"][LEARNER_INFO]["p0"][ - LEARNER_RESULTS_CURR_KL_COEFF_KEY - ] - if "p1" in results["info"][LEARNER_INFO]: - curr_kl_coeff_2 = results["info"][LEARNER_INFO]["p1"][ - LEARNER_RESULTS_CURR_KL_COEFF_KEY - ] - - self.assertNotEqual(curr_kl_coeff_1, initial_kl_coeff) - self.assertNotEqual(curr_kl_coeff_2, initial_kl_coeff) + algo = config.build() + # Call train while results aren't returned because this is + # a asynchronous Algorithm and results are returned asynchronously. + curr_kl_coeff_1 = None + curr_kl_coeff_2 = None + while not curr_kl_coeff_1 or not curr_kl_coeff_2: + results = algo.train() + + # Attempt to get the current KL coefficient from the learner. + # Iterate until we have found both coefficients at least once. 
+ if results and "info" in results and LEARNER_INFO in results["info"]: + if "p0" in results["info"][LEARNER_INFO]: + curr_kl_coeff_1 = results["info"][LEARNER_INFO]["p0"][ + LEARNER_RESULTS_CURR_KL_COEFF_KEY + ] + if "p1" in results["info"][LEARNER_INFO]: + curr_kl_coeff_2 = results["info"][LEARNER_INFO]["p1"][ + LEARNER_RESULTS_CURR_KL_COEFF_KEY + ] + + self.assertNotEqual(curr_kl_coeff_1, initial_kl_coeff) + self.assertNotEqual(curr_kl_coeff_2, initial_kl_coeff) if __name__ == "__main__": diff --git a/rllib/algorithms/sac/tests/test_rnnsac.py b/rllib/algorithms/sac/tests/test_rnnsac.py index e0fec4b5d7ee..704be42d1773 100644 --- a/rllib/algorithms/sac/tests/test_rnnsac.py +++ b/rllib/algorithms/sac/tests/test_rnnsac.py @@ -2,10 +2,9 @@ import ray from ray.rllib.algorithms import sac -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import check_compute_single_action, framework_iterator +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import check_compute_single_action -tf1, tf, tfv = try_import_tf() torch, nn = try_import_torch() @@ -53,17 +52,16 @@ def test_rnnsac_compilation(self): num_iterations = 1 # Test building an RNNSAC agent in all frameworks. - for _ in framework_iterator(config, frameworks="torch"): - algo = config.build() - for i in range(num_iterations): - results = algo.train() - print(results) + algo = config.build() + for i in range(num_iterations): + results = algo.train() + print(results) - check_compute_single_action( - algo, - include_state=True, - include_prev_action_reward=True, - ) + check_compute_single_action( + algo, + include_state=True, + include_prev_action_reward=True, + ) if __name__ == "__main__": diff --git a/rllib/algorithms/sac/tests/test_sac.py b/rllib/algorithms/sac/tests/test_sac.py index 1425fb6d93e7..ec9b7a4415a9 100644 --- a/rllib/algorithms/sac/tests/test_sac.py +++ b/rllib/algorithms/sac/tests/test_sac.py @@ -22,7 +22,6 @@ check, check_compute_single_action, check_train_results, - framework_iterator, ) from ray.rllib.utils.torch_utils import convert_to_torch_tensor from ray import tune @@ -119,47 +118,44 @@ def test_sac_compilation(self): ), ) - for fw in framework_iterator(config): - # Test for different env types (discrete w/ and w/o image, + cont). - for env in [ - "random_dict_env", - "random_tuple_env", - "CartPole-v1", - ]: - print("Env={}".format(env)) - config.environment(env) - # Test making the Q-model a custom one for CartPole, otherwise, - # use the default model. - config.q_model_config["custom_model"] = ( - "batch_norm{}".format("_torch" if fw == "torch" else "") - if env == "CartPole-v1" - else None - ) - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo) - - # Test, whether the replay buffer is saved along with - # a checkpoint (no point in doing it for all frameworks since - # this is framework agnostic). - if fw == "tf" and env == "CartPole-v1": - checkpoint = algo.save() - new_algo = config.build() - new_algo.restore(checkpoint) - # Get some data from the buffer and compare. - data = algo.local_replay_buffer.replay_buffers[ - "default_policy" - ]._storage[: 42 + 42] - new_data = new_algo.local_replay_buffer.replay_buffers[ - "default_policy" - ]._storage[: 42 + 42] - check(data, new_data) - new_algo.stop() - - algo.stop() + # Test for different env types (discrete w/ and w/o image, + cont). 
+ for env in [ + "random_dict_env", + "random_tuple_env", + "CartPole-v1", + ]: + print("Env={}".format(env)) + config.environment(env) + # Test making the Q-model a custom one for CartPole, otherwise, + # use the default model. + config.q_model_config["custom_model"] = ( + "batch_norm{}".format("_torch") if env == "CartPole-v1" else None + ) + algo = config.build() + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + check_compute_single_action(algo) + + # Test, whether the replay buffer is saved along with + # a checkpoint (no point in doing it for all frameworks since + # this is framework agnostic). + if env == "CartPole-v1": + checkpoint = algo.save() + new_algo = config.build() + new_algo.restore(checkpoint) + # Get some data from the buffer and compare. + data = algo.local_replay_buffer.replay_buffers[ + "default_policy" + ]._storage[: 42 + 42] + new_data = new_algo.local_replay_buffer.replay_buffers[ + "default_policy" + ]._storage[: 42 + 42] + check(data, new_data) + new_algo.stop() + + algo.stop() def test_sac_dict_obs_order(self): dict_space = Dict( @@ -210,13 +206,12 @@ def step(self, action): ) num_iterations = 1 - for _ in framework_iterator(config): - algo = config.build() - for _ in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo) + algo = config.build() + for _ in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + check_compute_single_action(algo) def _get_batch_helper(self, obs_size, actions, batch_size): return SampleBatch( diff --git a/rllib/algorithms/tests/test_algorithm.py b/rllib/algorithms/tests/test_algorithm.py index ffe45ea858b3..97b1cda0c9fe 100644 --- a/rllib/algorithms/tests/test_algorithm.py +++ b/rllib/algorithms/tests/test_algorithm.py @@ -26,7 +26,7 @@ LEARNER_RESULTS, ) from ray.rllib.utils.metrics.learner_info import LEARNER_INFO -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import check from ray.tune import register_env @@ -248,174 +248,166 @@ def test_add_policy_and_remove_policy(self): obs_space = gym.spaces.Box(-2.0, 2.0, (4,)) act_space = gym.spaces.Discrete(2) - for fw in framework_iterator(config): - # Pre-generate a policy instance to test adding these directly to an - # existing algorithm. - if fw == "tf": - policy_obj = ppo.PPOTF1Policy(obs_space, act_space, config.to_dict()) - elif fw == "tf2": - policy_obj = ppo.PPOTF2Policy(obs_space, act_space, config.to_dict()) + # Pre-generate a policy instance to test adding these directly to an + # existing algorithm. + policy_obj = ppo.PPOTorchPolicy(obs_space, act_space, config.to_dict()) + + # Construct the Algorithm with a single policy in it. + algo = config.build() + pol0 = algo.get_policy("p0") + r = algo.train() + self.assertTrue("p0" in r["info"][LEARNER_INFO]) + for i in range(1, 3): + + def new_mapping_fn(agent_id, episode, worker, i=i, **kwargs): + return f"p{choice([i, i - 1])}" + + # Add a new policy either by class (and options) or by instance. + pid = f"p{i}" + print(f"Adding policy {pid} ...") + # By (already instantiated) instance. + if i == 2: + new_pol = algo.add_policy( + pid, + # Pass in an already existing policy instance. + policy=policy_obj, + # Test changing the mapping fn. + policy_mapping_fn=new_mapping_fn, + # Change the list of policies to train. + policies_to_train=[f"p{i}", f"p{i - 1}"], + ) + # By class (and options). 
else: - policy_obj = ppo.PPOTorchPolicy(obs_space, act_space, config.to_dict()) - - # Construct the Algorithm with a single policy in it. - algo = config.build() - pol0 = algo.get_policy("p0") - r = algo.train() - self.assertTrue("p0" in r["info"][LEARNER_INFO]) - for i in range(1, 3): - - def new_mapping_fn(agent_id, episode, worker, i=i, **kwargs): - return f"p{choice([i, i - 1])}" - - # Add a new policy either by class (and options) or by instance. - pid = f"p{i}" - print(f"Adding policy {pid} ...") - # By (already instantiated) instance. - if i == 2: - new_pol = algo.add_policy( - pid, - # Pass in an already existing policy instance. - policy=policy_obj, - # Test changing the mapping fn. - policy_mapping_fn=new_mapping_fn, - # Change the list of policies to train. - policies_to_train=[f"p{i}", f"p{i - 1}"], - ) - # By class (and options). - else: - new_pol = algo.add_policy( - pid, - algo.get_default_policy_class(config), - observation_space=obs_space, - action_space=act_space, - # Test changing the mapping fn. - policy_mapping_fn=new_mapping_fn, - # Change the list of policies to train. - policies_to_train=[f"p{i}", f"p{i-1}"], - ) + new_pol = algo.add_policy( + pid, + algo.get_default_policy_class(config), + observation_space=obs_space, + action_space=act_space, + # Test changing the mapping fn. + policy_mapping_fn=new_mapping_fn, + # Change the list of policies to train. + policies_to_train=[f"p{i}", f"p{i-1}"], + ) - # Make sure new policy is part of remote workers in the - # worker set and the eval worker set. - self.assertTrue( - all( - algo.env_runner_group.foreach_worker( - func=lambda w, pid=pid: pid in w.policy_map - ) + # Make sure new policy is part of remote workers in the + # worker set and the eval worker set. + self.assertTrue( + all( + algo.env_runner_group.foreach_worker( + func=lambda w, pid=pid: pid in w.policy_map ) ) - self.assertTrue( - all( - algo.eval_env_runner_group.foreach_worker( - func=lambda w, pid=pid: pid in w.policy_map - ) + ) + self.assertTrue( + all( + algo.eval_env_runner_group.foreach_worker( + func=lambda w, pid=pid: pid in w.policy_map ) ) + ) - # Assert new policy is part of local worker (eval worker set does NOT - # have a local worker, only the main EnvRunnerGroup does). - pol_map = algo.env_runner.policy_map - self.assertTrue(new_pol is not pol0) - for j in range(i + 1): - self.assertTrue(f"p{j}" in pol_map) - self.assertTrue(len(pol_map) == i + 1) - algo.train() - checkpoint = algo.save().checkpoint - - # Test restoring from the checkpoint (which has more policies - # than what's defined in the config dict). - test = ppo.PPO.from_checkpoint(checkpoint=checkpoint) - - # Make sure evaluation worker also got the restored, added policy. - def _has_policies(w, pid=pid): - return ( - w.get_policy("p0") is not None and w.get_policy(pid) is not None - ) + # Assert new policy is part of local worker (eval worker set does NOT + # have a local worker, only the main EnvRunnerGroup does). + pol_map = algo.env_runner.policy_map + self.assertTrue(new_pol is not pol0) + for j in range(i + 1): + self.assertTrue(f"p{j}" in pol_map) + self.assertTrue(len(pol_map) == i + 1) + algo.train() + checkpoint = algo.save().checkpoint - self.assertTrue( - all(test.eval_env_runner_group.foreach_worker(_has_policies)) - ) + # Test restoring from the checkpoint (which has more policies + # than what's defined in the config dict). + test = ppo.PPO.from_checkpoint(checkpoint=checkpoint) - # Make sure algorithm can continue training the restored policy. 
- pol0 = test.get_policy("p0") - test.train() - # Test creating an action with the added (and restored) policy. - a = test.compute_single_action( - np.zeros_like(pol0.observation_space.sample()), policy_id=pid - ) - self.assertTrue(pol0.action_space.contains(a)) - test.stop() - - # After having added 2 policies, try to restore the Algorithm, - # but only with 1 of the originally added policies (plus the initial - # p0). - if i == 2: - - def new_mapping_fn(agent_id, episode, worker, **kwargs): - return f"p{choice([0, 2])}" - - test2 = ppo.PPO.from_checkpoint( - path=checkpoint, - policy_ids=["p0", "p2"], - policy_mapping_fn=new_mapping_fn, - policies_to_train=["p0"], - ) + # Make sure evaluation worker also got the restored, added policy. + def _has_policies(w, pid=pid): + return w.get_policy("p0") is not None and w.get_policy(pid) is not None - # Make sure evaluation workers have the same policies. - def _has_policies(w): - return ( - w.get_policy("p0") is not None - and w.get_policy("p2") is not None - and w.get_policy("p1") is None - ) + self.assertTrue( + all(test.eval_env_runner_group.foreach_worker(_has_policies)) + ) - self.assertTrue( - all(test2.eval_env_runner_group.foreach_worker(_has_policies)) - ) + # Make sure algorithm can continue training the restored policy. + pol0 = test.get_policy("p0") + test.train() + # Test creating an action with the added (and restored) policy. + a = test.compute_single_action( + np.zeros_like(pol0.observation_space.sample()), policy_id=pid + ) + self.assertTrue(pol0.action_space.contains(a)) + test.stop() - # Make sure algorithm can continue training the restored policy. - pol2 = test2.get_policy("p2") - test2.train() - # Test creating an action with the added (and restored) policy. - a = test2.compute_single_action( - np.zeros_like(pol2.observation_space.sample()), policy_id=pid - ) - self.assertTrue(pol2.action_space.contains(a)) - test2.stop() + # After having added 2 policies, try to restore the Algorithm, + # but only with 1 of the originally added policies (plus the initial + # p0). + if i == 2: - # Delete all added policies again from Algorithm. - for i in range(2, 0, -1): - pid = f"p{i}" - algo.remove_policy( - pid, - # Note that the complete signature of a policy_mapping_fn - # is: `agent_id, episode, worker, **kwargs`. - policy_mapping_fn=( - lambda agent_id, episode, worker, i=i, **kwargs: f"p{i - 1}" - ), - # Update list of policies to train. - policies_to_train=[f"p{i - 1}"], + def new_mapping_fn(agent_id, episode, worker, **kwargs): + return f"p{choice([0, 2])}" + + test2 = ppo.PPO.from_checkpoint( + path=checkpoint, + policy_ids=["p0", "p2"], + policy_mapping_fn=new_mapping_fn, + policies_to_train=["p0"], ) - # Make sure removed policy is no longer part of remote workers in the - # worker set and the eval worker set. + + # Make sure evaluation workers have the same policies. + def _has_policies(w): + return ( + w.get_policy("p0") is not None + and w.get_policy("p2") is not None + and w.get_policy("p1") is None + ) + self.assertTrue( - algo.env_runner_group.foreach_worker( - func=lambda w, pid=pid: pid not in w.policy_map - )[0] + all(test2.eval_env_runner_group.foreach_worker(_has_policies)) ) - self.assertTrue( - algo.eval_env_runner_group.foreach_worker( - func=lambda w, pid=pid: pid not in w.policy_map - )[0] + + # Make sure algorithm can continue training the restored policy. + pol2 = test2.get_policy("p2") + test2.train() + # Test creating an action with the added (and restored) policy. 
+ a = test2.compute_single_action( + np.zeros_like(pol2.observation_space.sample()), policy_id=pid ) - # Assert removed policy is no longer part of local worker - # (eval worker set does NOT have a local worker, only the main - # EnvRunnerGroup does). - pol_map = algo.env_runner.policy_map - self.assertTrue(pid not in pol_map) - self.assertTrue(len(pol_map) == i) + self.assertTrue(pol2.action_space.contains(a)) + test2.stop() - algo.stop() + # Delete all added policies again from Algorithm. + for i in range(2, 0, -1): + pid = f"p{i}" + algo.remove_policy( + pid, + # Note that the complete signature of a policy_mapping_fn + # is: `agent_id, episode, worker, **kwargs`. + policy_mapping_fn=( + lambda agent_id, episode, worker, i=i, **kwargs: f"p{i - 1}" + ), + # Update list of policies to train. + policies_to_train=[f"p{i - 1}"], + ) + # Make sure removed policy is no longer part of remote workers in the + # worker set and the eval worker set. + self.assertTrue( + algo.env_runner_group.foreach_worker( + func=lambda w, pid=pid: pid not in w.policy_map + )[0] + ) + self.assertTrue( + algo.eval_env_runner_group.foreach_worker( + func=lambda w, pid=pid: pid not in w.policy_map + )[0] + ) + # Assert removed policy is no longer part of local worker + # (eval worker set does NOT have a local worker, only the main + # EnvRunnerGroup does). + pol_map = algo.env_runner.policy_map + self.assertTrue(pid not in pol_map) + self.assertTrue(len(pol_map) == i) + + algo.stop() def test_evaluation_option(self): # Use a custom callback that asserts that we are running the @@ -432,29 +424,28 @@ def test_evaluation_option(self): .callbacks(callbacks_class=AssertEvalCallback) ) - for _ in framework_iterator(config, frameworks=("tf", "torch")): - algo = config.build() - # Given evaluation_interval=2, r0, r2, r4 should not contain - # evaluation metrics, while r1, r3 should. - r0 = algo.train() - print(r0) - r1 = algo.train() - print(r1) - r2 = algo.train() - print(r2) - r3 = algo.train() - print(r3) - algo.stop() - - self.assertFalse(EVALUATION_RESULTS in r0) - self.assertTrue(EVALUATION_RESULTS in r1) - self.assertFalse(EVALUATION_RESULTS in r2) - self.assertTrue(EVALUATION_RESULTS in r3) - self.assertTrue(ENV_RUNNER_RESULTS in r1[EVALUATION_RESULTS]) - self.assertTrue( - EPISODE_RETURN_MEAN in r1[EVALUATION_RESULTS][ENV_RUNNER_RESULTS] - ) - self.assertNotEqual(r1[EVALUATION_RESULTS], r3[EVALUATION_RESULTS]) + algo = config.build() + # Given evaluation_interval=2, r0, r2, r4 should not contain + # evaluation metrics, while r1, r3 should. 
+ r0 = algo.train() + print(r0) + r1 = algo.train() + print(r1) + r2 = algo.train() + print(r2) + r3 = algo.train() + print(r3) + algo.stop() + + self.assertFalse(EVALUATION_RESULTS in r0) + self.assertTrue(EVALUATION_RESULTS in r1) + self.assertFalse(EVALUATION_RESULTS in r2) + self.assertTrue(EVALUATION_RESULTS in r3) + self.assertTrue(ENV_RUNNER_RESULTS in r1[EVALUATION_RESULTS]) + self.assertTrue( + EPISODE_RETURN_MEAN in r1[EVALUATION_RESULTS][ENV_RUNNER_RESULTS] + ) + self.assertNotEqual(r1[EVALUATION_RESULTS], r3[EVALUATION_RESULTS]) def test_evaluation_option_always_attach_eval_metrics(self): # Use a custom callback that asserts that we are running the @@ -471,22 +462,21 @@ def test_evaluation_option_always_attach_eval_metrics(self): .reporting(min_sample_timesteps_per_iteration=100) .callbacks(callbacks_class=AssertEvalCallback) ) - for _ in framework_iterator(config, frameworks=("torch", "tf")): - algo = config.build() - # Should only see eval results, when eval actually ran. - r0 = algo.train() - r1 = algo.train() - r2 = algo.train() - r3 = algo.train() - algo.stop() - - # Eval results are not available at step 0. - # But step 3 should still have it, even though no eval was - # run during that step. - self.assertTrue(EVALUATION_RESULTS not in r0) - self.assertTrue(EVALUATION_RESULTS in r1) - self.assertTrue(EVALUATION_RESULTS not in r2) - self.assertTrue(EVALUATION_RESULTS in r3) + algo = config.build() + # Should only see eval results, when eval actually ran. + r0 = algo.train() + r1 = algo.train() + r2 = algo.train() + r3 = algo.train() + algo.stop() + + # Eval results are not available at step 0. + # But step 3 should still have it, even though no eval was + # run during that step. + self.assertTrue(EVALUATION_RESULTS not in r0) + self.assertTrue(EVALUATION_RESULTS in r1) + self.assertTrue(EVALUATION_RESULTS not in r2) + self.assertTrue(EVALUATION_RESULTS in r3) def test_evaluation_wo_evaluation_env_runner_group(self): # Use a custom callback that asserts that we are running the @@ -497,30 +487,29 @@ def test_evaluation_wo_evaluation_env_runner_group(self): .callbacks(callbacks_class=AssertEvalCallback) ) - for _ in framework_iterator(frameworks=("torch", "tf")): - # Setup algorithm w/o evaluation worker set and still call - # evaluate() -> Expect error. - algo_wo_env_on_local_worker = config.build() - self.assertRaisesRegex( - ValueError, - "Can't evaluate on a local worker", - algo_wo_env_on_local_worker.evaluate, - ) - algo_wo_env_on_local_worker.stop() - - # Try again using `create_env_on_driver=True`. - # This force-adds the env on the local-worker, so this Algorithm - # can `evaluate` even though it doesn't have an evaluation-worker - # set. - config.create_env_on_local_worker = True - algo_w_env_on_local_worker = config.build() - results = algo_w_env_on_local_worker.evaluate() - assert ( - ENV_RUNNER_RESULTS in results - and EPISODE_RETURN_MEAN in results[ENV_RUNNER_RESULTS] - ) - algo_w_env_on_local_worker.stop() - config.create_env_on_local_worker = False + # Setup algorithm w/o evaluation worker set and still call + # evaluate() -> Expect error. + algo_wo_env_on_local_worker = config.build() + self.assertRaisesRegex( + ValueError, + "Can't evaluate on a local worker", + algo_wo_env_on_local_worker.evaluate, + ) + algo_wo_env_on_local_worker.stop() + + # Try again using `create_env_on_driver=True`. + # This force-adds the env on the local-worker, so this Algorithm + # can `evaluate` even though it doesn't have an evaluation-worker + # set. 
+ config.create_env_on_local_worker = True + algo_w_env_on_local_worker = config.build() + results = algo_w_env_on_local_worker.evaluate() + assert ( + ENV_RUNNER_RESULTS in results + and EPISODE_RETURN_MEAN in results[ENV_RUNNER_RESULTS] + ) + algo_w_env_on_local_worker.stop() + config.create_env_on_local_worker = False def test_space_inference_from_remote_workers(self): # Expect to not do space inference if the learner has an env. diff --git a/rllib/algorithms/tests/test_algorithm_export_checkpoint.py b/rllib/algorithms/tests/test_algorithm_export_checkpoint.py index d5ddec5c79f7..9c64f160a5d8 100644 --- a/rllib/algorithms/tests/test_algorithm_export_checkpoint.py +++ b/rllib/algorithms/tests/test_algorithm_export_checkpoint.py @@ -6,11 +6,9 @@ import ray from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import framework_iterator +from ray.rllib.utils.framework import try_import_torch from ray.tune.registry import get_trainable_cls -tf1, tf, tfv = try_import_tf() torch, _ = try_import_torch() # Keep a set of all RLlib algos that support the RLModule API. @@ -66,27 +64,18 @@ def save_test(alg_name, framework="tf", multi_agent=False): ) # Test loading exported model and perform forward pass. - if framework == "torch": - filename = os.path.join(model_dir, "model.pt") - model = torch.load(filename) - assert model - results = model( - input_dict={"obs": torch.from_numpy(test_obs)}, - # TODO (sven): Make non-RNN models NOT expect these args at all. - state=[torch.tensor(0)], # dummy value - seq_lens=torch.tensor(0), # dummy value - ) - assert len(results) == 2 - assert results[0].shape == (1, 2) - assert results[1] == [torch.tensor(0)] # dummy - else: - model = tf.saved_model.load(model_dir) - assert model - results = model(tf.convert_to_tensor(test_obs, dtype=tf.float32)) - assert len(results) == 2 - assert results[0].shape == (1, 2) - # TODO (sven): Make non-RNN models NOT return states (empty list). - assert results[1].shape == (1, 1) # dummy state-out + filename = os.path.join(model_dir, "model.pt") + model = torch.load(filename) + assert model + results = model( + input_dict={"obs": torch.from_numpy(test_obs)}, + # TODO (sven): Make non-RNN models NOT expect these args at all. 
+            state=[torch.tensor(0)],  # dummy value
+            seq_lens=torch.tensor(0),  # dummy value
+        )
+        assert len(results) == 2
+        assert results[0].shape == (1, 2)
+        assert results[1] == [torch.tensor(0)]  # dummy

     shutil.rmtree(export_dir)


@@ -101,12 +90,10 @@ def tearDownClass(cls) -> None:
         ray.shutdown()

     def test_save_appo_multi_agent(self):
-        for fw in framework_iterator():
-            save_test("APPO", fw, multi_agent=True)
+        save_test("APPO", "torch", multi_agent=True)

     def test_save_ppo(self):
-        for fw in framework_iterator():
-            save_test("PPO", fw)
+        save_test("PPO", "torch")


 if __name__ == "__main__":
diff --git a/rllib/algorithms/tests/test_algorithm_rl_module_restore.py b/rllib/algorithms/tests/test_algorithm_rl_module_restore.py
index d13caa90766c..1dd50fb84035 100644
--- a/rllib/algorithms/tests/test_algorithm_rl_module_restore.py
+++ b/rllib/algorithms/tests/test_algorithm_rl_module_restore.py
@@ -17,7 +17,7 @@
     MultiRLModule,
 )
 from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
-from ray.rllib.utils.test_utils import check, framework_iterator
+from ray.rllib.utils.test_utils import check
 from ray.rllib.utils.numpy import convert_to_numpy


@@ -63,123 +63,119 @@ def test_e2e_load_simple_multi_rl_module(self):
         """Test if we can train a PPO algo with a checkpointed MultiRLModule e2e."""
         config = self.get_ppo_config()
         env = MultiAgentCartPole({"num_agents": NUM_AGENTS})
-        for fw in framework_iterator(config, frameworks=["tf2", "torch"]):
-            # create a multi_rl_module to load and save it to a checkpoint directory
-            module_specs = {}
-            module_class = PPO_MODULES[fw]
-            for i in range(NUM_AGENTS):
-                module_specs[f"policy_{i}"] = RLModuleSpec(
-                    module_class=module_class,
-                    observation_space=env.observation_space[0],
-                    action_space=env.action_space[0],
-                    # If we want to use this externally created module in the algorithm,
-                    # we need to provide the same config as the algorithm.
-                    model_config_dict=config.model_config
-                    | {"fcnet_hiddens": [32 * (i + 1)]},
-                    catalog_class=PPOCatalog,
-                )
-            multi_rl_module_spec = MultiRLModuleSpec(module_specs=module_specs)
-            multi_rl_module = multi_rl_module_spec.build()
-            multi_rl_module_weights = convert_to_numpy(multi_rl_module.get_state())
-            marl_checkpoint_path = tempfile.mkdtemp()
-            multi_rl_module.save_to_path(marl_checkpoint_path)
-
-            # create a new MARL_spec with the checkpoint from the previous one
-            multi_rl_module_spec_from_checkpoint = MultiRLModuleSpec(
-                module_specs=module_specs,
-                load_state_path=marl_checkpoint_path,
-            )
-            config = config.api_stack(enable_rl_module_and_learner=True).rl_module(
-                rl_module_spec=multi_rl_module_spec_from_checkpoint,
+        # create a multi_rl_module to load and save it to a checkpoint directory
+        module_specs = {}
+        module_class = PPO_MODULES["torch"]
+        for i in range(NUM_AGENTS):
+            module_specs[f"policy_{i}"] = RLModuleSpec(
+                module_class=module_class,
+                observation_space=env.observation_space[0],
+                action_space=env.action_space[0],
+                # If we want to use this externally created module in the algorithm,
+                # we need to provide the same config as the algorithm.
+ model_config_dict=config.model_config + | {"fcnet_hiddens": [32 * (i + 1)]}, + catalog_class=PPOCatalog, ) + multi_rl_module_spec = MultiRLModuleSpec(module_specs=module_specs) + multi_rl_module = multi_rl_module_spec.build() + multi_rl_module_weights = convert_to_numpy(multi_rl_module.get_state()) + marl_checkpoint_path = tempfile.mkdtemp() + multi_rl_module.save_to_path(marl_checkpoint_path) + + # create a new MARL_spec with the checkpoint from the previous one + multi_rl_module_spec_from_checkpoint = MultiRLModuleSpec( + module_specs=module_specs, + load_state_path=marl_checkpoint_path, + ) + config = config.api_stack(enable_rl_module_and_learner=True).rl_module( + rl_module_spec=multi_rl_module_spec_from_checkpoint, + ) - # create the algorithm with multiple nodes and check if the weights - # are the same as the original MultiRLModule - algo = config.build() - algo_module_weights = algo.learner_group.get_weights() - check(algo_module_weights, multi_rl_module_weights) - algo.train() - algo.stop() - del algo - shutil.rmtree(marl_checkpoint_path) + # create the algorithm with multiple nodes and check if the weights + # are the same as the original MultiRLModule + algo = config.build() + algo_module_weights = algo.learner_group.get_weights() + check(algo_module_weights, multi_rl_module_weights) + algo.train() + algo.stop() + del algo + shutil.rmtree(marl_checkpoint_path) def test_e2e_load_complex_multi_rl_module(self): """Test if we can train a PPO algorithm with a cpkt MARL and RL module e2e.""" config = self.get_ppo_config() env = MultiAgentCartPole({"num_agents": NUM_AGENTS}) - for fw in framework_iterator(config, frameworks=["tf2", "torch"]): - # create a multi_rl_module to load and save it to a checkpoint directory - module_specs = {} - module_class = PPO_MODULES[fw] - for i in range(NUM_AGENTS): - module_specs[f"policy_{i}"] = RLModuleSpec( - module_class=module_class, - observation_space=env.observation_space[0], - action_space=env.action_space[0], - # If we want to use this externally created module in the algorithm, - # we need to provide the same config as the algorithm. - model_config_dict=config.model_config - | {"fcnet_hiddens": [32 * (i + 1)]}, - catalog_class=PPOCatalog, - ) - multi_rl_module_spec = MultiRLModuleSpec(module_specs=module_specs) - multi_rl_module = multi_rl_module_spec.build() - marl_checkpoint_path = tempfile.mkdtemp() - multi_rl_module.save_to_path(marl_checkpoint_path) - - # create a RLModule to load and override the "policy_1" module with - module_to_swap_in = RLModuleSpec( - module_class=module_class, - observation_space=env.observation_space[0], - action_space=env.action_space[0], - # Note, we need to pass in the default model config for the algorithm - # to be able to use this module later. 
- model_config_dict=config.model_config | {"fcnet_hiddens": [64]}, - catalog_class=PPOCatalog, - ).build() - - module_to_swap_in_path = tempfile.mkdtemp() - module_to_swap_in.save_to_path(module_to_swap_in_path) - - # create a new MARL_spec with the checkpoint from the marl_checkpoint - # and the module_to_swap_in_checkpoint - module_specs["policy_1"] = RLModuleSpec( + # create a multi_rl_module to load and save it to a checkpoint directory + module_specs = {} + module_class = PPO_MODULES["torch"] + for i in range(NUM_AGENTS): + module_specs[f"policy_{i}"] = RLModuleSpec( module_class=module_class, observation_space=env.observation_space[0], action_space=env.action_space[0], - model_config_dict={"fcnet_hiddens": [64]}, + # If we want to use this externally created module in the algorithm, + # we need to provide the same config as the algorithm. + model_config_dict=config.model_config + | {"fcnet_hiddens": [32 * (i + 1)]}, catalog_class=PPOCatalog, - load_state_path=module_to_swap_in_path, - ) - multi_rl_module_spec_from_checkpoint = MultiRLModuleSpec( - module_specs=module_specs, - load_state_path=marl_checkpoint_path, - ) - config = config.api_stack(enable_rl_module_and_learner=True).rl_module( - rl_module_spec=multi_rl_module_spec_from_checkpoint, ) + multi_rl_module_spec = MultiRLModuleSpec(module_specs=module_specs) + multi_rl_module = multi_rl_module_spec.build() + marl_checkpoint_path = tempfile.mkdtemp() + multi_rl_module.save_to_path(marl_checkpoint_path) + + # create a RLModule to load and override the "policy_1" module with + module_to_swap_in = RLModuleSpec( + module_class=module_class, + observation_space=env.observation_space[0], + action_space=env.action_space[0], + # Note, we need to pass in the default model config for the algorithm + # to be able to use this module later. 
+ model_config_dict=config.model_config | {"fcnet_hiddens": [64]}, + catalog_class=PPOCatalog, + ).build() + + module_to_swap_in_path = tempfile.mkdtemp() + module_to_swap_in.save_to_path(module_to_swap_in_path) + + # create a new MARL_spec with the checkpoint from the marl_checkpoint + # and the module_to_swap_in_checkpoint + module_specs["policy_1"] = RLModuleSpec( + module_class=module_class, + observation_space=env.observation_space[0], + action_space=env.action_space[0], + model_config_dict={"fcnet_hiddens": [64]}, + catalog_class=PPOCatalog, + load_state_path=module_to_swap_in_path, + ) + multi_rl_module_spec_from_checkpoint = MultiRLModuleSpec( + module_specs=module_specs, + load_state_path=marl_checkpoint_path, + ) + config = config.api_stack(enable_rl_module_and_learner=True).rl_module( + rl_module_spec=multi_rl_module_spec_from_checkpoint, + ) - # create the algorithm with multiple nodes and check if the weights - # are the same as the original MultiRLModule - algo = config.build() - algo_module_weights = algo.learner_group.get_weights() + # create the algorithm with multiple nodes and check if the weights + # are the same as the original MultiRLModule + algo = config.build() + algo_module_weights = algo.learner_group.get_weights() - multi_rl_module_with_swapped_in_module = MultiRLModule() - multi_rl_module_with_swapped_in_module.add_module( - "policy_0", multi_rl_module["policy_0"] - ) - multi_rl_module_with_swapped_in_module.add_module( - "policy_1", module_to_swap_in - ) + multi_rl_module_with_swapped_in_module = MultiRLModule() + multi_rl_module_with_swapped_in_module.add_module( + "policy_0", multi_rl_module["policy_0"] + ) + multi_rl_module_with_swapped_in_module.add_module("policy_1", module_to_swap_in) - check( - algo_module_weights, - convert_to_numpy(multi_rl_module_with_swapped_in_module.get_state()), - ) - algo.train() - algo.stop() - del algo - shutil.rmtree(marl_checkpoint_path) + check( + algo_module_weights, + convert_to_numpy(multi_rl_module_with_swapped_in_module.get_state()), + ) + algo.train() + algo.stop() + del algo + shutil.rmtree(marl_checkpoint_path) def test_e2e_load_rl_module(self): """Test if we can train a PPO algorithm with a cpkt RL module e2e.""" @@ -197,49 +193,48 @@ def test_e2e_load_rl_module(self): .training(num_sgd_iter=1, train_batch_size=8, sgd_minibatch_size=8) ) env = gym.make("CartPole-v1") - for fw in framework_iterator(config, frameworks=["tf2", "torch"]): - # create a multi_rl_module to load and save it to a checkpoint directory - module_class = PPO_MODULES[fw] - module_spec = RLModuleSpec( - module_class=module_class, - observation_space=env.observation_space, - action_space=env.action_space, - # If we want to use this externally created module in the algorithm, - # we need to provide the same config as the algorithm. 
- model_config_dict=config.model_config | {"fcnet_hiddens": [32]}, - catalog_class=PPOCatalog, - ) - module = module_spec.build() - - module_ckpt_path = tempfile.mkdtemp() - module.save_to_path(module_ckpt_path) - - module_to_load_spec = RLModuleSpec( - module_class=module_class, - observation_space=env.observation_space, - action_space=env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, - catalog_class=PPOCatalog, - load_state_path=module_ckpt_path, - ) + # create a multi_rl_module to load and save it to a checkpoint directory + module_class = PPO_MODULES["torch"] + module_spec = RLModuleSpec( + module_class=module_class, + observation_space=env.observation_space, + action_space=env.action_space, + # If we want to use this externally created module in the algorithm, + # we need to provide the same config as the algorithm. + model_config_dict=config.model_config | {"fcnet_hiddens": [32]}, + catalog_class=PPOCatalog, + ) + module = module_spec.build() + + module_ckpt_path = tempfile.mkdtemp() + module.save_to_path(module_ckpt_path) + + module_to_load_spec = RLModuleSpec( + module_class=module_class, + observation_space=env.observation_space, + action_space=env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + catalog_class=PPOCatalog, + load_state_path=module_ckpt_path, + ) - config = config.api_stack(enable_rl_module_and_learner=True).rl_module( - rl_module_spec=module_to_load_spec, - ) + config = config.api_stack(enable_rl_module_and_learner=True).rl_module( + rl_module_spec=module_to_load_spec, + ) - # create the algorithm with multiple nodes and check if the weights - # are the same as the original MultiRLModule - algo = config.build() - algo_module_weights = algo.learner_group.get_weights() + # create the algorithm with multiple nodes and check if the weights + # are the same as the original MultiRLModule + algo = config.build() + algo_module_weights = algo.learner_group.get_weights() - check( - algo_module_weights[DEFAULT_MODULE_ID], - convert_to_numpy(module.get_state()), - ) - algo.train() - algo.stop() - del algo - shutil.rmtree(module_ckpt_path) + check( + algo_module_weights[DEFAULT_MODULE_ID], + convert_to_numpy(module.get_state()), + ) + algo.train() + algo.stop() + del algo + shutil.rmtree(module_ckpt_path) def test_e2e_load_complex_multi_rl_module_with_modules_to_load(self): """Test if we can train a PPO algorithm with a cpkt MARL and RL module e2e. @@ -251,107 +246,104 @@ def test_e2e_load_complex_multi_rl_module_with_modules_to_load(self): num_agents = 3 config = self.get_ppo_config(num_agents=num_agents) env = MultiAgentCartPole({"num_agents": num_agents}) - for fw in framework_iterator(config, frameworks=["tf2", "torch"]): - # create a multi_rl_module to load and save it to a checkpoint directory - module_specs = {} - module_class = PPO_MODULES[fw] - for i in range(num_agents): - module_specs[f"policy_{i}"] = RLModuleSpec( - module_class=module_class, - observation_space=env.observation_space[0], - action_space=env.action_space[0], - # Note, we need to pass in the default model config for the - # algorithm to be able to use this module later. 
- model_config_dict=config.model_config - | {"fcnet_hiddens": [32 * (i + 1)]}, - catalog_class=PPOCatalog, - ) - multi_rl_module_spec = MultiRLModuleSpec(module_specs=module_specs) - multi_rl_module = multi_rl_module_spec.build() - marl_checkpoint_path = tempfile.mkdtemp() - multi_rl_module.save_to_path(marl_checkpoint_path) - - # create a RLModule to load and override the "policy_1" module with - module_to_swap_in = RLModuleSpec( + # create a multi_rl_module to load and save it to a checkpoint directory + module_specs = {} + module_class = PPO_MODULES["torch"] + for i in range(num_agents): + module_specs[f"policy_{i}"] = RLModuleSpec( module_class=module_class, observation_space=env.observation_space[0], action_space=env.action_space[0], - # Note, we need to pass in the default model config for the algorithm - # to be able to use this module later. - model_config_dict=config.model_config | {"fcnet_hiddens": [64]}, + # Note, we need to pass in the default model config for the + # algorithm to be able to use this module later. + model_config_dict=config.model_config + | {"fcnet_hiddens": [32 * (i + 1)]}, catalog_class=PPOCatalog, - ).build() - - module_to_swap_in_path = tempfile.mkdtemp() - module_to_swap_in.save_to_path(module_to_swap_in_path) - - # create a new MARL_spec with the checkpoint from the marl_checkpoint - # and the module_to_swap_in_checkpoint - module_specs["policy_1"] = RLModuleSpec( - module_class=module_class, - observation_space=env.observation_space[0], - action_space=env.action_space[0], - model_config_dict={"fcnet_hiddens": [64]}, - catalog_class=PPOCatalog, - load_state_path=module_to_swap_in_path, - ) - multi_rl_module_spec_from_checkpoint = MultiRLModuleSpec( - module_specs=module_specs, - load_state_path=marl_checkpoint_path, - modules_to_load={ - "policy_0", - }, - ) - config = config.api_stack(enable_rl_module_and_learner=True).rl_module( - rl_module_spec=multi_rl_module_spec_from_checkpoint, ) + multi_rl_module_spec = MultiRLModuleSpec(module_specs=module_specs) + multi_rl_module = multi_rl_module_spec.build() + marl_checkpoint_path = tempfile.mkdtemp() + multi_rl_module.save_to_path(marl_checkpoint_path) + + # create a RLModule to load and override the "policy_1" module with + module_to_swap_in = RLModuleSpec( + module_class=module_class, + observation_space=env.observation_space[0], + action_space=env.action_space[0], + # Note, we need to pass in the default model config for the algorithm + # to be able to use this module later. 
+ model_config_dict=config.model_config | {"fcnet_hiddens": [64]}, + catalog_class=PPOCatalog, + ).build() + + module_to_swap_in_path = tempfile.mkdtemp() + module_to_swap_in.save_to_path(module_to_swap_in_path) + + # create a new MARL_spec with the checkpoint from the marl_checkpoint + # and the module_to_swap_in_checkpoint + module_specs["policy_1"] = RLModuleSpec( + module_class=module_class, + observation_space=env.observation_space[0], + action_space=env.action_space[0], + model_config_dict={"fcnet_hiddens": [64]}, + catalog_class=PPOCatalog, + load_state_path=module_to_swap_in_path, + ) + multi_rl_module_spec_from_checkpoint = MultiRLModuleSpec( + module_specs=module_specs, + load_state_path=marl_checkpoint_path, + modules_to_load={ + "policy_0", + }, + ) + config = config.api_stack(enable_rl_module_and_learner=True).rl_module( + rl_module_spec=multi_rl_module_spec_from_checkpoint, + ) - # create the algorithm with multiple nodes and check if the weights - # are the same as the original MultiRLModule - algo = config.build() - algo_module_weights = algo.learner_group.get_weights() + # create the algorithm with multiple nodes and check if the weights + # are the same as the original MultiRLModule + algo = config.build() + algo_module_weights = algo.learner_group.get_weights() - # weights of "policy_0" should be the same as in the loaded MultiRLModule - # since we specified it as being apart of the modules_to_load - check( - algo_module_weights["policy_0"], - convert_to_numpy(multi_rl_module["policy_0"].get_state()), - ) - # weights of "policy_1" should be the same as in the module_to_swap_in since - # we specified its load path separately in an rl_module_spec inside of the - # multi_rl_module_spec_from_checkpoint - check( - algo_module_weights["policy_1"], - convert_to_numpy(module_to_swap_in.get_state()), - ) - # weights of "policy_2" should be different from the loaded MultiRLModule - # since we didn't specify it as being apart of the modules_to_load - policy_2_algo_module_weight_sum = np.sum( - [ - np.sum(s) - for s in tree.flatten( - convert_to_numpy(algo_module_weights["policy_2"]) - ) - ] - ) - policy_2_multi_rl_module_weight_sum = np.sum( - [ - np.sum(s) - for s in tree.flatten( - convert_to_numpy(multi_rl_module["policy_2"].get_state()) - ) - ] - ) - check( - policy_2_algo_module_weight_sum, - policy_2_multi_rl_module_weight_sum, - false=True, - ) + # weights of "policy_0" should be the same as in the loaded MultiRLModule + # since we specified it as being apart of the modules_to_load + check( + algo_module_weights["policy_0"], + convert_to_numpy(multi_rl_module["policy_0"].get_state()), + ) + # weights of "policy_1" should be the same as in the module_to_swap_in since + # we specified its load path separately in an rl_module_spec inside of the + # multi_rl_module_spec_from_checkpoint + check( + algo_module_weights["policy_1"], + convert_to_numpy(module_to_swap_in.get_state()), + ) + # weights of "policy_2" should be different from the loaded MultiRLModule + # since we didn't specify it as being apart of the modules_to_load + policy_2_algo_module_weight_sum = np.sum( + [ + np.sum(s) + for s in tree.flatten(convert_to_numpy(algo_module_weights["policy_2"])) + ] + ) + policy_2_multi_rl_module_weight_sum = np.sum( + [ + np.sum(s) + for s in tree.flatten( + convert_to_numpy(multi_rl_module["policy_2"].get_state()) + ) + ] + ) + check( + policy_2_algo_module_weight_sum, + policy_2_multi_rl_module_weight_sum, + false=True, + ) - algo.train() - algo.stop() - del algo - 
shutil.rmtree(marl_checkpoint_path) + algo.train() + algo.stop() + del algo + shutil.rmtree(marl_checkpoint_path) if __name__ == "__main__": diff --git a/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py b/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py index 3e8e9ef13079..3b71c09528bf 100644 --- a/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py +++ b/rllib/algorithms/tests/test_algorithm_save_load_checkpoint_learner.py @@ -6,7 +6,7 @@ from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.core import DEFAULT_MODULE_ID from ray.rllib.utils.metrics.learner_info import LEARNER_INFO -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import check algorithms_and_configs = { @@ -96,27 +96,26 @@ def tearDowClass(cls) -> None: def test_save_and_restore(self): for algo_name in algorithms_and_configs: config = algorithms_and_configs[algo_name] - for _ in framework_iterator(config, frameworks=["torch", "tf2"]): - with tempfile.TemporaryDirectory() as tmpdir: - # create an algorithm, checkpoint it, then train for 2 iterations - ray.get(save_and_train.remote(config, "CartPole-v1", tmpdir)) - # load that checkpoint into a new algorithm and train for 2 - # iterations - results_algo_2 = ray.get( - load_and_train.remote(config, "CartPole-v1", tmpdir) - ) - - # load that checkpoint into another new algorithm and train for 2 - # iterations - results_algo_3 = ray.get( - load_and_train.remote(config, "CartPole-v1", tmpdir) - ) - - # check that the results are the same across loaded algorithms - # they won't be the same as the first algorithm since the random - # state that is used for each algorithm is not preserved across - # checkpoints. - check(results_algo_3, results_algo_2) + with tempfile.TemporaryDirectory() as tmpdir: + # create an algorithm, checkpoint it, then train for 2 iterations + ray.get(save_and_train.remote(config, "CartPole-v1", tmpdir)) + # load that checkpoint into a new algorithm and train for 2 + # iterations + results_algo_2 = ray.get( + load_and_train.remote(config, "CartPole-v1", tmpdir) + ) + + # load that checkpoint into another new algorithm and train for 2 + # iterations + results_algo_3 = ray.get( + load_and_train.remote(config, "CartPole-v1", tmpdir) + ) + + # check that the results are the same across loaded algorithms + # they won't be the same as the first algorithm since the random + # state that is used for each algorithm is not preserved across + # checkpoints. 
+ check(results_algo_3, results_algo_2) if __name__ == "__main__": diff --git a/rllib/algorithms/tests/test_callbacks_old_stack.py b/rllib/algorithms/tests/test_callbacks_old_stack.py index f9045a18c694..dcbe2e516733 100644 --- a/rllib/algorithms/tests/test_callbacks_old_stack.py +++ b/rllib/algorithms/tests/test_callbacks_old_stack.py @@ -7,7 +7,6 @@ from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.evaluation.episode import Episode from ray.rllib.examples.envs.classes.random_env import RandomEnv -from ray.rllib.utils.test_utils import framework_iterator class EpisodeAndSampleCallbacks(DefaultCallbacks): @@ -82,16 +81,15 @@ def test_episode_and_sample_callbacks(self): .callbacks(EpisodeAndSampleCallbacks) .training(train_batch_size=50, sgd_minibatch_size=50, num_sgd_iter=1) ) - for _ in framework_iterator(config, frameworks=("tf", "torch")): - algo = config.build() - algo.train() - algo.train() - callback_obj = algo.env_runner.callbacks - self.assertGreater(callback_obj.counts["sample"], 0) - self.assertGreater(callback_obj.counts["start"], 0) - self.assertGreater(callback_obj.counts["end"], 0) - self.assertGreater(callback_obj.counts["step"], 0) - algo.stop() + algo = config.build() + algo.train() + algo.train() + callback_obj = algo.env_runner.callbacks + self.assertGreater(callback_obj.counts["sample"], 0) + self.assertGreater(callback_obj.counts["start"], 0) + self.assertGreater(callback_obj.counts["end"], 0) + self.assertGreater(callback_obj.counts["step"], 0) + algo.stop() def test_on_sub_environment_created(self): @@ -108,24 +106,23 @@ def test_on_sub_environment_created(self): ): config.callbacks(callbacks) - for _ in framework_iterator(config, frameworks=("tf", "torch")): - algo = config.build() - # Fake the counter on the local worker (doesn't have an env) and - # set it to -1 so the below `foreach_worker()` won't fail. - algo.env_runner.sum_sub_env_vector_indices = -1 - - # Get sub-env vector index sums from the 2 remote workers: - sum_sub_env_vector_indices = algo.env_runner_group.foreach_worker( - lambda w: w.sum_sub_env_vector_indices - ) - # Local worker has no environments -> Expect the -1 special - # value returned by the above lambda. - self.assertTrue(sum_sub_env_vector_indices[0] == -1) - # Both remote workers (index 1 and 2) have a vector index counter - # of 6 (sum of vector indices: 0 + 1 + 2 + 3). - self.assertTrue(sum_sub_env_vector_indices[1] == 6) - self.assertTrue(sum_sub_env_vector_indices[2] == 6) - algo.stop() + algo = config.build() + # Fake the counter on the local worker (doesn't have an env) and + # set it to -1 so the below `foreach_worker()` won't fail. + algo.env_runner.sum_sub_env_vector_indices = -1 + + # Get sub-env vector index sums from the 2 remote workers: + sum_sub_env_vector_indices = algo.env_runner_group.foreach_worker( + lambda w: w.sum_sub_env_vector_indices + ) + # Local worker has no environments -> Expect the -1 special + # value returned by the above lambda. + self.assertTrue(sum_sub_env_vector_indices[0] == -1) + # Both remote workers (index 1 and 2) have a vector index counter + # of 6 (sum of vector indices: 0 + 1 + 2 + 3). 
+ self.assertTrue(sum_sub_env_vector_indices[1] == 6) + self.assertTrue(sum_sub_env_vector_indices[2] == 6) + algo.stop() def test_on_sub_environment_created_with_remote_envs(self): config = ( @@ -148,24 +145,23 @@ def test_on_sub_environment_created_with_remote_envs(self): ): config.callbacks(callbacks) - for _ in framework_iterator(config, frameworks=("tf", "torch")): - algo = config.build() - # Fake the counter on the local worker (doesn't have an env) and - # set it to -1 so the below `foreach_worker()` won't fail. - algo.env_runner.sum_sub_env_vector_indices = -1 - - # Get sub-env vector index sums from the 2 remote workers: - sum_sub_env_vector_indices = algo.env_runner_group.foreach_worker( - lambda w: w.sum_sub_env_vector_indices - ) - # Local worker has no environments -> Expect the -1 special - # value returned by the above lambda. - self.assertTrue(sum_sub_env_vector_indices[0] == -1) - # Both remote workers (index 1 and 2) have a vector index counter - # of 6 (sum of vector indices: 0 + 1 + 2 + 3). - self.assertTrue(sum_sub_env_vector_indices[1] == 6) - self.assertTrue(sum_sub_env_vector_indices[2] == 6) - algo.stop() + algo = config.build() + # Fake the counter on the local worker (doesn't have an env) and + # set it to -1 so the below `foreach_worker()` won't fail. + algo.env_runner.sum_sub_env_vector_indices = -1 + + # Get sub-env vector index sums from the 2 remote workers: + sum_sub_env_vector_indices = algo.env_runner_group.foreach_worker( + lambda w: w.sum_sub_env_vector_indices + ) + # Local worker has no environments -> Expect the -1 special + # value returned by the above lambda. + self.assertTrue(sum_sub_env_vector_indices[0] == -1) + # Both remote workers (index 1 and 2) have a vector index counter + # of 6 (sum of vector indices: 0 + 1 + 2 + 3). + self.assertTrue(sum_sub_env_vector_indices[1] == 6) + self.assertTrue(sum_sub_env_vector_indices[2] == 6) + algo.stop() def test_on_episode_created(self): # 1000 steps sampled (2.5 episodes on each sub-environment) before training diff --git a/rllib/algorithms/tests/test_callbacks_on_algorithm.py b/rllib/algorithms/tests/test_callbacks_on_algorithm.py index 05021456cb91..241d9ad31afa 100644 --- a/rllib/algorithms/tests/test_callbacks_on_algorithm.py +++ b/rllib/algorithms/tests/test_callbacks_on_algorithm.py @@ -7,7 +7,6 @@ from ray.rllib.algorithms.callbacks import DefaultCallbacks from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.examples.envs.classes.cartpole_crashing import CartPoleCrashing -from ray.rllib.utils.test_utils import framework_iterator from ray import tune @@ -98,22 +97,19 @@ def test_on_init_and_checkpoint_loaded(self): .environment("CartPole-v1") .callbacks(InitAndCheckpointRestoredCallbacks) ) - for _ in framework_iterator(config, frameworks=("torch", "tf2")): - algo = config.build() - self.assertTrue(algo.callbacks._on_init_was_called) + algo = config.build() + self.assertTrue(algo.callbacks._on_init_was_called) + self.assertTrue(not hasattr(algo.callbacks, "_on_checkpoint_loaded_was_called")) + algo.train() + # Save algo and restore. + with tempfile.TemporaryDirectory() as tmpdir: + algo.save(checkpoint_dir=tmpdir) self.assertTrue( not hasattr(algo.callbacks, "_on_checkpoint_loaded_was_called") ) - algo.train() - # Save algo and restore. 
- with tempfile.TemporaryDirectory() as tmpdir: - algo.save(checkpoint_dir=tmpdir) - self.assertTrue( - not hasattr(algo.callbacks, "_on_checkpoint_loaded_was_called") - ) - algo.load_checkpoint(checkpoint_dir=tmpdir) - self.assertTrue(algo.callbacks._on_checkpoint_loaded_was_called) - algo.stop() + algo.load_checkpoint(checkpoint_dir=tmpdir) + self.assertTrue(algo.callbacks._on_checkpoint_loaded_was_called) + algo.stop() if __name__ == "__main__": diff --git a/rllib/core/learner/tests/test_learner.py b/rllib/core/learner/tests/test_learner.py index c1d60b71e201..815b9b54a2d4 100644 --- a/rllib/core/learner/tests/test_learner.py +++ b/rllib/core/learner/tests/test_learner.py @@ -9,15 +9,10 @@ from ray.rllib.core.testing.testing_learner import BaseTestingAlgorithmConfig from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import ( - check, - framework_iterator, - get_cartpole_dataset_reader, -) +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader from ray.rllib.utils.metrics import ALL_MODULES -_, tf, _ = try_import_tf() torch, _ = try_import_torch() @@ -37,22 +32,17 @@ def test_end_to_end_update(self): config = BaseTestingAlgorithmConfig() - for _ in framework_iterator(config, frameworks=("torch", "tf2")): - learner = config.build_learner(env=self.ENV) - reader = get_cartpole_dataset_reader(batch_size=512) - - min_loss = float("inf") - for iter_i in range(1000): - batch = reader.next() - results = learner.update_from_batch(batch=batch.as_multi_agent()) - - loss = results[ALL_MODULES][Learner.TOTAL_LOSS_KEY] - min_loss = min(loss, min_loss) - print(f"[iter = {iter_i}] Loss: {loss:.3f}, Min Loss: {min_loss:.3f}") - # The loss is initially around 0.69 (ln2). When it gets to around - # 0.58 the return of the policy gets to around 100. - if min_loss < 0.58: - break + learner = config.build_learner(env=self.ENV) + reader = get_cartpole_dataset_reader(batch_size=512) + + min_loss = float("inf") + for iter_i in range(1000): + batch = reader.next() + results = learner.update_from_batch(batch=batch.as_multi_agent()) + + loss = results[ALL_MODULES][Learner.TOTAL_LOSS_KEY] + min_loss = min(loss, min_loss) + print(f"[iter = {iter_i}] Loss: {loss:.3f}, Min Loss: {min_loss:.3f}") self.assertLess(min_loss, 0.58) def test_compute_gradients(self): @@ -63,27 +53,20 @@ def test_compute_gradients(self): """ config = BaseTestingAlgorithmConfig() - for fw in framework_iterator(config, frameworks=("torch", "tf2")): - learner = config.build_learner(env=self.ENV) + learner = config.build_learner(env=self.ENV) - params = learner.get_parameters(learner.module[DEFAULT_MODULE_ID]) + params = learner.get_parameters(learner.module[DEFAULT_MODULE_ID]) - tape = None - if fw == "torch": - loss_per_module = {ALL_MODULES: sum(param.sum() for param in params)} - else: - with tf.GradientTape() as tape: - loss_per_module = { - ALL_MODULES: sum(tf.reduce_sum(param) for param in params) - } + tape = None + loss_per_module = {ALL_MODULES: sum(param.sum() for param in params)} - gradients = learner.compute_gradients(loss_per_module, gradient_tape=tape) + gradients = learner.compute_gradients(loss_per_module, gradient_tape=tape) - # Type should be a mapping from ParamRefs to gradients. - self.assertIsInstance(gradients, dict) + # Type should be a mapping from ParamRefs to gradients. 
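
The `check(grad, np.ones(grad.shape))` assertion that follows relies on the loss being a plain sum over all parameters, whose gradient with respect to every element is exactly 1. A standalone torch sketch of that fact (illustrative only, not part of the patch):

    import torch

    w = torch.randn(3, 2, requires_grad=True)
    loss = w.sum()  # d(loss)/d(w[i, j]) == 1 for every element
    loss.backward()
    assert torch.equal(w.grad, torch.ones_like(w))
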
+        self.assertIsInstance(gradients, dict)

-            for grad in gradients.values():
-                check(grad, np.ones(grad.shape))
+        for grad in gradients.values():
+            check(grad, np.ones(grad.shape))

     def test_postprocess_gradients(self):
         """Tests the base grad clipping logic in `postprocess_gradients()`."""
@@ -93,69 +76,66 @@
             lr=0.0003, grad_clip=0.75, grad_clip_by="value"
         )

-        for fw in framework_iterator(config, frameworks=("torch", "tf2")):
-            learner = config.build_learner(env=self.ENV)
-            # Pretend our computed gradients are our weights + 1.0.
-            grads = {
-                learner.get_param_ref(v): v + 1.0
-                for v in learner.get_parameters(learner.module[DEFAULT_MODULE_ID])
-            }
-            # Call the learner's postprocessing method.
-            processed_grads = list(learner.postprocess_gradients(grads).values())
-            # Check clipped gradients.
-            # No single gradient must be larger than 0.1 or smaller than -0.1:
-            self.assertTrue(
-                all(
-                    np.max(grad) <= config.grad_clip
-                    and np.min(grad) >= -config.grad_clip
-                    for grad in convert_to_numpy(processed_grads)
-                )
+        learner = config.build_learner(env=self.ENV)
+        # Pretend our computed gradients are our weights + 1.0.
+        grads = {
+            learner.get_param_ref(v): v + 1.0
+            for v in learner.get_parameters(learner.module[DEFAULT_MODULE_ID])
+        }
+        # Call the learner's postprocessing method.
+        processed_grads = list(learner.postprocess_gradients(grads).values())
+        # Check clipped gradients.
+        # No single gradient element may exceed grad_clip (0.75) or fall below -0.75:
+        self.assertTrue(
+            all(
+                np.max(grad) <= config.grad_clip and np.min(grad) >= -config.grad_clip
                 for grad in convert_to_numpy(processed_grads)
             )
+        )

-            # Clip by norm.
-            config.grad_clip = 1.0
-            config.grad_clip_by = "norm"
-            learner = config.build_learner(env=self.ENV)
-            # Pretend our computed gradients are our weights + 1.0.
-            grads = {
-                learner.get_param_ref(v): v + 1.0
-                for v in learner.get_parameters(learner.module[DEFAULT_MODULE_ID])
-            }
-            # Call the learner's postprocessing method.
-            processed_grads = list(learner.postprocess_gradients(grads).values())
-            # Check clipped gradients.
+        # Clip by norm.
+        config.grad_clip = 1.0
+        config.grad_clip_by = "norm"
+        learner = config.build_learner(env=self.ENV)
+        # Pretend our computed gradients are our weights + 1.0.
+        grads = {
+            learner.get_param_ref(v): v + 1.0
+            for v in learner.get_parameters(learner.module[DEFAULT_MODULE_ID])
+        }
+        # Call the learner's postprocessing method.
+        processed_grads = list(learner.postprocess_gradients(grads).values())
+        # Check clipped gradients.
+        for proc_grad, grad in zip(
+            convert_to_numpy(processed_grads),
+            convert_to_numpy(list(grads.values())),
+        ):
+            l2_norm = np.sqrt(np.sum(grad**2.0))
+            if l2_norm > config.grad_clip:
+                check(proc_grad, grad * (config.grad_clip / l2_norm))
+
+        # Clip by global norm.
+        config.grad_clip = 5.0
+        config.grad_clip_by = "global_norm"
+        learner = config.build_learner(env=self.ENV)
+        # Pretend our computed gradients are our weights + 1.0.
+        grads = {
+            learner.get_param_ref(v): v + 1.0
+            for v in learner.get_parameters(learner.module[DEFAULT_MODULE_ID])
+        }
+        # Call the learner's postprocessing method.
+        processed_grads = list(learner.postprocess_gradients(grads).values())
+        # Check clipped gradients.
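
The expected values for the three clipping modes exercised in this test (by value above, by norm and by global norm continuing below) reduce to a few numpy formulas. A standalone sketch with toy gradient tensors (hypothetical values, mirroring the arithmetic the test checks against):

    import numpy as np

    grads = [np.array([0.5, -2.0]), np.array([1.5, 0.25])]

    # grad_clip_by="value": every element is clamped into [-clip, clip].
    clip = 0.75
    by_value = [np.clip(g, -clip, clip) for g in grads]

    # grad_clip_by="norm": rescale each tensor whose own L2 norm exceeds clip.
    clip = 1.0
    by_norm = [
        g * (clip / np.sqrt(np.sum(g**2))) if np.sqrt(np.sum(g**2)) > clip else g
        for g in grads
    ]

    # grad_clip_by="global_norm": one scaling factor computed across all tensors.
    clip = 5.0
    global_norm = np.sqrt(sum(np.sum(g**2) for g in grads))
    if global_norm > clip:
        by_global_norm = [g * (clip / global_norm) for g in grads]
    else:
        by_global_norm = grads
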
+ global_norm = np.sqrt( + np.sum( + np.sum(grad**2.0) for grad in convert_to_numpy(list(grads.values())) + ) + ) + if global_norm > config.grad_clip: for proc_grad, grad in zip( convert_to_numpy(processed_grads), - convert_to_numpy(list(grads.values())), + grads.values(), ): - l2_norm = np.sqrt(np.sum(grad**2.0)) - if l2_norm > config.grad_clip: - check(proc_grad, grad * (config.grad_clip / l2_norm)) - - # Clip by global norm. - config.grad_clip = 5.0 - config.grad_clip_by = "global_norm" - learner = config.build_learner(env=self.ENV) - # Pretend our computed gradients are our weights + 1.0. - grads = { - learner.get_param_ref(v): v + 1.0 - for v in learner.get_parameters(learner.module[DEFAULT_MODULE_ID]) - } - # Call the learner's postprocessing method. - processed_grads = list(learner.postprocess_gradients(grads).values()) - # Check clipped gradients. - global_norm = np.sqrt( - np.sum( - np.sum(grad**2.0) - for grad in convert_to_numpy(list(grads.values())) - ) - ) - if global_norm > config.grad_clip: - for proc_grad, grad in zip( - convert_to_numpy(processed_grads), - grads.values(), - ): - check(proc_grad, grad * (config.grad_clip / global_norm)) + check(proc_grad, grad * (config.grad_clip / global_norm)) def test_apply_gradients(self): """Tests the apply_gradients correctness. @@ -165,31 +145,23 @@ def test_apply_gradients(self): """ config = BaseTestingAlgorithmConfig().training(lr=0.0003) - for fw in framework_iterator(config, frameworks=("torch", "tf2")): - learner = config.build_learner(env=self.ENV) - - # calculated the expected new params based on gradients of all ones. - params = learner.get_parameters(learner.module[DEFAULT_MODULE_ID]) - n_steps = 100 - expected = [ - ( - convert_to_numpy(param) - - n_steps * learner.config.lr * np.ones(param.shape) - ) - for param in params - ] - for _ in range(n_steps): - if fw == "torch": - gradients = { - learner.get_param_ref(p): torch.ones_like(p) for p in params - } - else: - gradients = { - learner.get_param_ref(p): tf.ones_like(p) for p in params - } - learner.apply_gradients(gradients) - - check(params, expected) + learner = config.build_learner(env=self.ENV) + + # calculated the expected new params based on gradients of all ones. + params = learner.get_parameters(learner.module[DEFAULT_MODULE_ID]) + n_steps = 100 + expected = [ + ( + convert_to_numpy(param) + - n_steps * learner.config.lr * np.ones(param.shape) + ) + for param in params + ] + for _ in range(n_steps): + gradients = {learner.get_param_ref(p): torch.ones_like(p) for p in params} + learner.apply_gradients(gradients) + + check(params, expected) def test_add_remove_module(self): """Tests the compute/apply_gradients with add/remove modules. 
@@ -200,79 +172,66 @@ def test_add_remove_module(self): """ config = BaseTestingAlgorithmConfig().training(lr=0.0003) - for fw in framework_iterator(config, frameworks=("torch", "tf2")): - learner = config.build_learner(env=self.ENV) - rl_module_spec = config.get_default_rl_module_spec() - rl_module_spec.observation_space = self.ENV.observation_space - rl_module_spec.action_space = self.ENV.action_space - learner.add_module( - module_id="test", - module_spec=rl_module_spec, - ) - learner.remove_module(DEFAULT_MODULE_ID) - - # only test module should be left - self.assertEqual(set(learner.module.keys()), {"test"}) + learner = config.build_learner(env=self.ENV) + rl_module_spec = config.get_default_rl_module_spec() + rl_module_spec.observation_space = self.ENV.observation_space + rl_module_spec.action_space = self.ENV.action_space + learner.add_module( + module_id="test", + module_spec=rl_module_spec, + ) + learner.remove_module(DEFAULT_MODULE_ID) + + # only test module should be left + self.assertEqual(set(learner.module.keys()), {"test"}) + + # calculated the expected new params based on gradients of all ones. + params = learner.get_parameters(learner.module["test"]) + n_steps = 100 + expected = [ + convert_to_numpy(param) - n_steps * learner.config.lr * np.ones(param.shape) + for param in params + ] + for _ in range(n_steps): + tape = None + loss_per_module = {ALL_MODULES: sum(param.sum() for param in params)} + gradients = learner.compute_gradients(loss_per_module, gradient_tape=tape) + learner.apply_gradients(gradients) - # calculated the expected new params based on gradients of all ones. - params = learner.get_parameters(learner.module["test"]) - n_steps = 100 - expected = [ - convert_to_numpy(param) - - n_steps * learner.config.lr * np.ones(param.shape) - for param in params - ] - for _ in range(n_steps): - tape = None - if fw == "torch": - loss_per_module = { - ALL_MODULES: sum(param.sum() for param in params) - } - else: - with tf.GradientTape() as tape: - loss_per_module = { - ALL_MODULES: sum(tf.reduce_sum(param) for param in params) - } - gradients = learner.compute_gradients( - loss_per_module, gradient_tape=tape - ) - learner.apply_gradients(gradients) - - check(params, expected) + check(params, expected) def test_save_to_path_and_restore_from_path(self): """Tests, whether a Learner's state is properly saved and restored.""" config = BaseTestingAlgorithmConfig() - for fw in framework_iterator(config, frameworks=("torch", "tf2")): - # Get a Learner instance for the framework and env. - learner1 = config.build_learner(env=self.ENV) - with tempfile.TemporaryDirectory() as tmpdir: - learner1.save_to_path(tmpdir) - - learner2 = config.build_learner(env=self.ENV) - learner2.restore_from_path(tmpdir) - self._check_learner_states(fw, learner1, learner2) - - # Add a module then save/load and check states. - with tempfile.TemporaryDirectory() as tmpdir: - rl_module_spec = config.get_default_rl_module_spec() - rl_module_spec.observation_space = self.ENV.observation_space - rl_module_spec.action_space = self.ENV.action_space - learner1.add_module( - module_id="test", - module_spec=rl_module_spec, - ) - learner1.save_to_path(tmpdir) - learner2 = Learner.from_checkpoint(tmpdir) - self._check_learner_states(fw, learner1, learner2) - - # Remove a module then save/load and check states. 
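
The `expected` arrays in `test_apply_gradients` and `test_add_remove_module` above encode plain SGD arithmetic: with a constant all-ones gradient, every update subtracts `lr`, so after `n_steps` updates each element has moved by exactly `n_steps * lr`. A numeric sketch with hypothetical parameter values:

    import numpy as np

    lr, n_steps = 0.0003, 100
    param = np.array([0.2, -0.1, 0.05])
    expected = param - n_steps * lr * np.ones_like(param)

    updated = param.copy()
    for _ in range(n_steps):
        updated -= lr * np.ones_like(param)  # one SGD step with an all-ones gradient
    assert np.allclose(updated, expected)
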
- with tempfile.TemporaryDirectory() as tmpdir: - learner1.remove_module(module_id=DEFAULT_MODULE_ID) - learner1.save_to_path(tmpdir) - learner2 = Learner.from_checkpoint(tmpdir) - self._check_learner_states(fw, learner1, learner2) + # Get a Learner instance for the framework and env. + learner1 = config.build_learner(env=self.ENV) + with tempfile.TemporaryDirectory() as tmpdir: + learner1.save_to_path(tmpdir) + + learner2 = config.build_learner(env=self.ENV) + learner2.restore_from_path(tmpdir) + self._check_learner_states("torch", learner1, learner2) + + # Add a module then save/load and check states. + with tempfile.TemporaryDirectory() as tmpdir: + rl_module_spec = config.get_default_rl_module_spec() + rl_module_spec.observation_space = self.ENV.observation_space + rl_module_spec.action_space = self.ENV.action_space + learner1.add_module( + module_id="test", + module_spec=rl_module_spec, + ) + learner1.save_to_path(tmpdir) + learner2 = Learner.from_checkpoint(tmpdir) + self._check_learner_states("torch", learner1, learner2) + + # Remove a module then save/load and check states. + with tempfile.TemporaryDirectory() as tmpdir: + learner1.remove_module(module_id=DEFAULT_MODULE_ID) + learner1.save_to_path(tmpdir) + learner2 = Learner.from_checkpoint(tmpdir) + self._check_learner_states("torch", learner1, learner2) def _check_learner_states(self, framework, learner1, learner2): check(learner1.module.get_state(), learner2.module.get_state()) diff --git a/rllib/core/models/tests/test_catalog.py b/rllib/core/models/tests/test_catalog.py index 86d561a3f752..7189f02dceaa 100644 --- a/rllib/core/models/tests/test_catalog.py +++ b/rllib/core/models/tests/test_catalog.py @@ -44,7 +44,6 @@ from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.spaces.space_utils import get_dummy_batch_for_space -from ray.rllib.utils.test_utils import framework_iterator from ray.rllib.utils.torch_utils import convert_to_torch_tensor _, tf, _ = try_import_tf() @@ -320,63 +319,54 @@ def test_get_dist_cls_from_action_space(self): model_config_dict=MODEL_DEFAULTS.copy(), ) - for framework in framework_iterator(frameworks=["tf2", "torch"]): - - if framework == "tf2": - framework = "tf2" + dist_cls = catalog._get_dist_cls_from_action_space( + action_space=action_space, + framework="torch", + ) - dist_cls = catalog._get_dist_cls_from_action_space( + # Check if we can query the required input dimensions + expected_cls = expected_cls_dict["torch"] + if ( + expected_cls is TorchMultiDistribution + or expected_cls is TfMultiDistribution + ): + # For these special cases, we need to create partials of the + # expected classes so that we can calculate the required inputs + expected_cls = _multi_action_dist_partial_helper( + catalog_cls=catalog, action_space=action_space, - framework=framework, + framework="torch", ) - - # Check if we can query the required input dimensions - expected_cls = expected_cls_dict[framework] - if ( - expected_cls is TorchMultiDistribution - or expected_cls is TfMultiDistribution - ): - # For these special cases, we need to create partials of the - # expected classes so that we can calculate the required inputs - expected_cls = _multi_action_dist_partial_helper( - catalog_cls=catalog, - action_space=action_space, - framework=framework, - ) - elif ( - expected_cls is TorchMultiCategorical - or expected_cls is TfMultiCategorical - ): - # For these special cases, we need to create partials of the - # expected classes so 
that we can calculate the required inputs - expected_cls = _multi_categorical_dist_partial_helper( - action_space=action_space, framework=framework - ) - - # Now that we have sorted out special cases, we can finally get the - # input_dim - input_dim = expected_cls.required_input_dim(action_space) - logits = np.ones((32, input_dim), dtype=np.float32) - if framework == "torch": - logits = torch.from_numpy(logits) - else: - logits = tf.convert_to_tensor(logits) - # We don't need a model if we input tensors - dist = dist_cls.from_logits(logits=logits) - self.assertTrue( - isinstance(dist, expected_cls_dict[framework]), - msg=f"Expected {expected_cls_dict[framework]}, " - f"got {type(dist)}", + elif ( + expected_cls is TorchMultiCategorical + or expected_cls is TfMultiCategorical + ): + # For these special cases, we need to create partials of the + # expected classes so that we can calculate the required inputs + expected_cls = _multi_categorical_dist_partial_helper( + action_space=action_space, framework="torch" ) - # Test if sampling works - actions = dist.sample() - # Test is logp works - dist.logp(actions) - - # For any array of actions in a possibly nested space, convert to - # numpy and pick the first one to check if it is in the action space. - action = tree.map_structure(lambda a: convert_to_numpy(a)[0], actions) - self.assertTrue(action_space.contains(action)) + + # Now that we have sorted out special cases, we can finally get the + # input_dim + input_dim = expected_cls.required_input_dim(action_space) + logits = np.ones((32, input_dim), dtype=np.float32) + logits = torch.from_numpy(logits) + # We don't need a model if we input tensors + dist = dist_cls.from_logits(logits=logits) + self.assertTrue( + isinstance(dist, expected_cls_dict["torch"]), + msg=f"Expected {expected_cls_dict['torch']}, " f"got {type(dist)}", + ) + # Test if sampling works + actions = dist.sample() + # Test is logp works + dist.logp(actions) + + # For any array of actions in a possibly nested space, convert to + # numpy and pick the first one to check if it is in the action space. + action = tree.map_structure(lambda a: convert_to_numpy(a)[0], actions) + self.assertTrue(action_space.contains(action)) def test_customize_catalog_from_algorithm_config(self): """Test if we can pass catalog to algorithm config and it ends up inside diff --git a/rllib/core/models/tests/test_cnn_encoders.py b/rllib/core/models/tests/test_cnn_encoders.py index c0536ca2bc6d..d7b344aba375 100644 --- a/rllib/core/models/tests/test_cnn_encoders.py +++ b/rllib/core/models/tests/test_cnn_encoders.py @@ -4,10 +4,9 @@ from ray.rllib.core.models.base import ENCODER_OUT from ray.rllib.core.models.configs import CNNEncoderConfig from ray.rllib.models.utils import get_filter_config -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import framework_iterator, ModelChecker +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import ModelChecker -_, tf, _ = try_import_tf() torch, _ = try_import_torch() @@ -67,11 +66,10 @@ def test_cnn_encoders(self): # with each other. model_checker = ModelChecker(config) - for fw in framework_iterator(frameworks=("tf2", "torch")): - # Add this framework version of the model to our checker. - outputs = model_checker.add(framework=fw) - # Confirm that the config conputed the correct (actual) output dims. 
-            self.assertEqual(outputs[ENCODER_OUT].shape, (1, config.output_dims[0]))
+        # Add this framework version of the model to our checker.
+        outputs = model_checker.add(framework="torch")
+        # Confirm that the config computed the correct (actual) output dims.
+        self.assertEqual(outputs[ENCODER_OUT].shape, (1, config.output_dims[0]))

         # Check all added models against each other.
         model_checker.check()
@@ -97,11 +95,10 @@ def test_cnn_encoders_valid_padding(self):
         # with each other.
         model_checker = ModelChecker(config)

-        for fw in framework_iterator(frameworks=("tf2", "torch")):
-            # Add this framework version of the model to our checker.
-            outputs = model_checker.add(framework=fw)
-            # Confirm that the config conputed the correct (actual) output dims.
-            self.assertEqual(outputs[ENCODER_OUT].shape, (1, config.output_dims[0]))
+        # Add this framework version of the model to our checker.
+        outputs = model_checker.add(framework="torch")
+        # Confirm that the config computed the correct (actual) output dims.
+        self.assertEqual(outputs[ENCODER_OUT].shape, (1, config.output_dims[0]))

         # Check all added models against each other.
         model_checker.check()
diff --git a/rllib/core/models/tests/test_cnn_transpose_heads.py b/rllib/core/models/tests/test_cnn_transpose_heads.py
index 2b7b38c00bbc..2c5a0d13c037 100644
--- a/rllib/core/models/tests/test_cnn_transpose_heads.py
+++ b/rllib/core/models/tests/test_cnn_transpose_heads.py
@@ -2,10 +2,9 @@ import unittest

 from ray.rllib.core.models.configs import CNNTransposeHeadConfig
-from ray.rllib.utils.framework import try_import_tf, try_import_torch
-from ray.rllib.utils.test_utils import framework_iterator, ModelChecker
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.utils.test_utils import ModelChecker

-_, tf, _ = try_import_tf()
 torch, _ = try_import_torch()
@@ -92,10 +91,9 @@ def test_cnn_transpose_heads(self):
         # with each other.
         model_checker = ModelChecker(config)

-        for fw in framework_iterator(frameworks=("tf2", "torch")):
-            # Add this framework version of the model to our checker.
-            outputs = model_checker.add(framework=fw)
-            self.assertEqual(outputs.shape, (1,) + tuple(expected_output_dims))
+        # Add this framework version of the model to our checker.
+        outputs = model_checker.add(framework="torch")
+        self.assertEqual(outputs.shape, (1,) + tuple(expected_output_dims))

         # Check all added models against each other.
         model_checker.check()
diff --git a/rllib/core/models/tests/test_mlp_encoders.py b/rllib/core/models/tests/test_mlp_encoders.py
index 25fecf3f5235..96b5fc45dbe3 100644
--- a/rllib/core/models/tests/test_mlp_encoders.py
+++ b/rllib/core/models/tests/test_mlp_encoders.py
@@ -3,10 +2,9 @@
 from ray.rllib.core.models.configs import MLPEncoderConfig
 from ray.rllib.core.models.base import ENCODER_OUT
-from ray.rllib.utils.framework import try_import_tf, try_import_torch
-from ray.rllib.utils.test_utils import framework_iterator, ModelChecker
+from ray.rllib.utils.framework import try_import_torch
+from ray.rllib.utils.test_utils import ModelChecker

-_, tf, _ = try_import_tf()
 torch, _ = try_import_torch()
@@ -72,10 +71,9 @@ def test_mlp_encoders(self):
         # with each other.
         model_checker = ModelChecker(config)

-        for fw in framework_iterator(frameworks=("tf2", "torch")):
-            # Add this framework version of the model to our checker.
-            outputs = model_checker.add(framework=fw)
-            self.assertEqual(outputs[ENCODER_OUT].shape, (1, output_dim))
+        # Add this framework version of the model to our checker.
+ outputs = model_checker.add(framework="torch") + self.assertEqual(outputs[ENCODER_OUT].shape, (1, output_dim)) # Check all added models against each other. model_checker.check() diff --git a/rllib/core/models/tests/test_mlp_heads.py b/rllib/core/models/tests/test_mlp_heads.py index 366814b5d584..fcdcf0ac9695 100644 --- a/rllib/core/models/tests/test_mlp_heads.py +++ b/rllib/core/models/tests/test_mlp_heads.py @@ -2,10 +2,9 @@ import unittest from ray.rllib.core.models.configs import MLPHeadConfig, FreeLogStdMLPHeadConfig -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import framework_iterator, ModelChecker +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import ModelChecker -_, tf, _ = try_import_tf() torch, nn = try_import_torch() @@ -77,10 +76,9 @@ def test_mlp_heads(self): # with each other. model_checker = ModelChecker(config) - for fw in framework_iterator(frameworks=("tf2", "torch")): - # Add this framework version of the model to our checker. - outputs = model_checker.add(framework=fw) - self.assertEqual(outputs.shape, (1, output_dim)) + # Add this framework version of the model to our checker. + outputs = model_checker.add(framework="torch") + self.assertEqual(outputs.shape, (1, output_dim)) # Check all added models against each other. model_checker.check() diff --git a/rllib/core/models/tests/test_recurrent_encoders.py b/rllib/core/models/tests/test_recurrent_encoders.py index f3fe42f34c8d..e2ba68be01b7 100644 --- a/rllib/core/models/tests/test_recurrent_encoders.py +++ b/rllib/core/models/tests/test_recurrent_encoders.py @@ -4,10 +4,9 @@ from ray.rllib.core.columns import Columns from ray.rllib.core.models.base import ENCODER_OUT from ray.rllib.core.models.configs import RecurrentEncoderConfig -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import framework_iterator, ModelChecker +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.test_utils import ModelChecker -_, tf, _ = try_import_tf() torch, _ = try_import_torch() @@ -54,19 +53,18 @@ def test_gru_encoders(self): # with each other. model_checker = ModelChecker(config) - for fw in framework_iterator(frameworks=("tf2", "torch")): - # Add this framework version of the model to our checker. - outputs = model_checker.add(framework=fw) - # Output shape: [1=B, 1=T, [output_dim]] - self.assertEqual( - outputs[ENCODER_OUT].shape, - (1, 1, config.output_dims[0]), - ) - # State shapes: [1=B, 1=num_layers, [hidden_dim]] - self.assertEqual( - outputs[Columns.STATE_OUT]["h"].shape, - (1, num_layers, hidden_dim), - ) + # Add this framework version of the model to our checker. + outputs = model_checker.add(framework="torch") + # Output shape: [1=B, 1=T, [output_dim]] + self.assertEqual( + outputs[ENCODER_OUT].shape, + (1, 1, config.output_dims[0]), + ) + # State shapes: [1=B, 1=num_layers, [hidden_dim]] + self.assertEqual( + outputs[Columns.STATE_OUT]["h"].shape, + (1, num_layers, hidden_dim), + ) # Check all added models against each other. model_checker.check() @@ -112,23 +110,22 @@ def test_lstm_encoders(self): # with each other. model_checker = ModelChecker(config) - for fw in framework_iterator(frameworks=("tf2", "torch")): - # Add this framework version of the model to our checker. 
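
The shape assertions in the GRU hunk above use a batch-major layout: outputs as (B, T, output_dim) and state as (B, num_layers, hidden_dim). Plain torch reports the recurrent state layers-first, so a transpose is presumably involved inside the encoder; a small reference sketch with hypothetical sizes:

    import torch

    num_layers, hidden_dim = 2, 16  # hypothetical values
    gru = torch.nn.GRU(
        input_size=8, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True
    )
    out, h_n = gru(torch.zeros(1, 1, 8))  # B=1, T=1
    assert out.shape == (1, 1, hidden_dim)  # (B, T, hidden)
    assert h_n.shape == (num_layers, 1, hidden_dim)  # torch reports layers-first
    assert h_n.transpose(0, 1).shape == (1, num_layers, hidden_dim)  # batch-major, as asserted above
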
- outputs = model_checker.add(framework=fw) - # Output shape: [1=B, 1=T, [output_dim]] - self.assertEqual( - outputs[ENCODER_OUT].shape, - (1, 1, config.output_dims[0]), - ) - # State shapes: [1=B, 1=num_layers, [hidden_dim]] - self.assertEqual( - outputs[Columns.STATE_OUT]["h"].shape, - (1, num_layers, hidden_dim), - ) - self.assertEqual( - outputs[Columns.STATE_OUT]["c"].shape, - (1, num_layers, hidden_dim), - ) + # Add this framework version of the model to our checker. + outputs = model_checker.add(framework="torch") + # Output shape: [1=B, 1=T, [output_dim]] + self.assertEqual( + outputs[ENCODER_OUT].shape, + (1, 1, config.output_dims[0]), + ) + # State shapes: [1=B, 1=num_layers, [hidden_dim]] + self.assertEqual( + outputs[Columns.STATE_OUT]["h"].shape, + (1, num_layers, hidden_dim), + ) + self.assertEqual( + outputs[Columns.STATE_OUT]["c"].shape, + (1, num_layers, hidden_dim), + ) # Check all added models against each other (only if bias=False). # See here on why pytorch uses two bias vectors per layer and tf only uses diff --git a/rllib/core/testing/tests/test_bc_algorithm.py b/rllib/core/testing/tests/test_bc_algorithm.py index 9403e183eda3..c40e09400301 100644 --- a/rllib/core/testing/tests/test_bc_algorithm.py +++ b/rllib/core/testing/tests/test_bc_algorithm.py @@ -8,14 +8,12 @@ BCTorchMultiAgentModuleWithSharedEncoder, ) from ray.rllib.core.testing.tf.bc_module import ( - DiscreteBCTFModule, BCTfRLModuleWithSharedGlobalEncoder, BCTfMultiAgentModuleWithSharedEncoder, ) from ray.rllib.core.rl_module.rl_module import RLModuleSpec from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec from ray.rllib.core.testing.bc_algorithm import BCConfigTest -from ray.rllib.utils.test_utils import framework_iterator from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole @@ -37,16 +35,11 @@ def test_bc_algorithm(self): .training(model={"fcnet_hiddens": [32, 32]}) ) - # TODO (Kourosh): Add tf2 support - for fw in framework_iterator(config, frameworks=("torch")): - algo = config.build(env="CartPole-v1") - policy = algo.get_policy() - rl_module = policy.model + algo = config.build(env="CartPole-v1") + policy = algo.get_policy() + rl_module = policy.model - if fw == "torch": - assert isinstance(rl_module, DiscreteBCTorchModule) - elif fw == "tf2": - assert isinstance(rl_module, DiscreteBCTFModule) + assert isinstance(rl_module, DiscreteBCTorchModule) def test_bc_algorithm_marl(self): """Tests simple extension of single-agent to independent multi-agent case.""" @@ -63,17 +56,12 @@ def test_bc_algorithm_marl(self): .environment(MultiAgentCartPole, env_config={"num_agents": 2}) ) - # TODO (Kourosh): Add tf2 support - for fw in framework_iterator(config, frameworks=("torch")): - algo = config.build() - for policy_id in policies: - policy = algo.get_policy(policy_id=policy_id) - rl_module = policy.model + algo = config.build() + for policy_id in policies: + policy = algo.get_policy(policy_id=policy_id) + rl_module = policy.model - if fw == "torch": - assert isinstance(rl_module, DiscreteBCTorchModule) - elif fw == "tf2": - assert isinstance(rl_module, DiscreteBCTFModule) + assert isinstance(rl_module, DiscreteBCTorchModule) def test_bc_algorithm_w_custom_multi_rl_module(self): """Tests the independent multi-agent case with shared encoders.""" diff --git a/rllib/evaluation/tests/test_rollout_worker.py b/rllib/evaluation/tests/test_rollout_worker.py index 2b98f3aaf726..4f8ed097170c 100644 --- a/rllib/evaluation/tests/test_rollout_worker.py +++ 
b/rllib/evaluation/tests/test_rollout_worker.py @@ -40,7 +40,7 @@ NUM_AGENT_STEPS_TRAINED, EPISODE_RETURN_MEAN, ) -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import check from ray.tune.registry import register_env @@ -177,33 +177,26 @@ def test_global_vars_update(self): # lr = 0.1 - [(0.1 - 0.000001) / 100000] * ts .training(lr_schedule=[[0, 0.1], [100000, 0.000001]]) ) - for fw in framework_iterator(config, frameworks=("tf2", "tf")): - algo = config.build() - policy = algo.get_policy() - for i in range(3): - result = algo.train() - print( - "{}={}".format( - NUM_AGENT_STEPS_TRAINED, result["info"][NUM_AGENT_STEPS_TRAINED] - ) - ) - print( - "{}={}".format( - NUM_AGENT_STEPS_SAMPLED, result["info"][NUM_AGENT_STEPS_SAMPLED] - ) + algo = config.build() + policy = algo.get_policy() + for i in range(3): + result = algo.train() + print( + "{}={}".format( + NUM_AGENT_STEPS_TRAINED, result["info"][NUM_AGENT_STEPS_TRAINED] ) - global_timesteps = ( - policy.global_timestep - if fw == "tf" - else policy.global_timestep.numpy() + ) + print( + "{}={}".format( + NUM_AGENT_STEPS_SAMPLED, result["info"][NUM_AGENT_STEPS_SAMPLED] ) - print("global_timesteps={}".format(global_timesteps)) - expected_lr = 0.1 - ((0.1 - 0.000001) / 100000) * global_timesteps - lr = policy.cur_lr - if fw == "tf": - lr = policy.get_session().run(lr) - check(lr, expected_lr, rtol=0.05) - algo.stop() + ) + global_timesteps = policy.global_timestep + print("global_timesteps={}".format(global_timesteps)) + expected_lr = 0.1 - ((0.1 - 0.000001) / 100000) * global_timesteps + lr = policy.cur_lr + check(lr, expected_lr, rtol=0.05) + algo.stop() def test_query_evaluators(self): register_env("test", lambda _: gym.make("CartPole-v1")) @@ -217,21 +210,20 @@ def test_query_evaluators(self): ) .training(train_batch_size=20, sgd_minibatch_size=5, num_sgd_iter=1) ) - for _ in framework_iterator(config, frameworks=("torch", "tf")): - algo = config.build() - results = algo.env_runner_group.foreach_worker( - lambda w: w.total_rollout_fragment_length - ) - results2 = algo.env_runner_group.foreach_worker_with_id( - lambda i, w: (i, w.total_rollout_fragment_length) - ) - results3 = algo.env_runner_group.foreach_worker( - lambda w: w.foreach_env(lambda env: 1) - ) - self.assertEqual(results, [10, 10, 10]) - self.assertEqual(results2, [(0, 10), (1, 10), (2, 10)]) - self.assertEqual(results3, [[1, 1], [1, 1], [1, 1]]) - algo.stop() + algo = config.build() + results = algo.env_runner_group.foreach_worker( + lambda w: w.total_rollout_fragment_length + ) + results2 = algo.env_runner_group.foreach_worker_with_id( + lambda i, w: (i, w.total_rollout_fragment_length) + ) + results3 = algo.env_runner_group.foreach_worker( + lambda w: w.foreach_env(lambda env: 1) + ) + self.assertEqual(results, [10, 10, 10]) + self.assertEqual(results2, [(0, 10), (1, 10), (2, 10)]) + self.assertEqual(results3, [[1, 1], [1, 1], [1, 1]]) + algo.stop() def test_action_clipping(self): action_space = gym.spaces.Box(-2.0, 1.0, (3,)) diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 69dbce3212f0..457abba37f63 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -13,7 +13,6 @@ from ray.rllib.examples._old_api_stack.policy.episode_env_aware_policy import ( EpisodeEnvAwareAttentionPolicy, ) -from ray.rllib.models.tf.attention_net import GTrXLNet from ray.rllib.policy.rnn_sequencing import 
pad_batch_to_sequences_of_same_size from ray.rllib.policy.sample_batch import ( DEFAULT_POLICY_ID, @@ -23,7 +22,7 @@ from ray.rllib.policy.view_requirement import ViewRequirement from ray.rllib.utils.annotations import override from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME -from ray.rllib.utils.test_utils import framework_iterator, check +from ray.rllib.utils.test_utils import check class MyCallbacks(DefaultCallbacks): @@ -64,42 +63,39 @@ def test_traj_view_normal_case(self): ) ) - for _ in framework_iterator(config): - algo = config.build() - policy = algo.get_policy() - view_req_model = policy.model.view_requirements - view_req_policy = policy.view_requirements - assert len(view_req_model) == 1, view_req_model - assert len(view_req_policy) == 12, view_req_policy - for key in [ - SampleBatch.OBS, - SampleBatch.ACTIONS, - SampleBatch.REWARDS, - SampleBatch.TERMINATEDS, - SampleBatch.TRUNCATEDS, - SampleBatch.NEXT_OBS, - SampleBatch.EPS_ID, - SampleBatch.AGENT_INDEX, - "weights", - ]: - assert key in view_req_policy - # None of the view cols has a special underlying data_col, - # except next-obs. - if key != SampleBatch.NEXT_OBS: - assert view_req_policy[key].data_col is None - else: - assert view_req_policy[key].data_col == SampleBatch.OBS - assert view_req_policy[key].shift == 1 - rollout_worker = algo.env_runner - sample_batch = rollout_worker.sample() - sample_batch = convert_ma_batch_to_sample_batch(sample_batch) - expected_count = ( - config.num_envs_per_env_runner * config.rollout_fragment_length - ) - assert sample_batch.count == expected_count - for v in sample_batch.values(): - assert len(v) == expected_count - algo.stop() + algo = config.build() + policy = algo.get_policy() + view_req_model = policy.model.view_requirements + view_req_policy = policy.view_requirements + assert len(view_req_model) == 1, view_req_model + assert len(view_req_policy) == 12, view_req_policy + for key in [ + SampleBatch.OBS, + SampleBatch.ACTIONS, + SampleBatch.REWARDS, + SampleBatch.TERMINATEDS, + SampleBatch.TRUNCATEDS, + SampleBatch.NEXT_OBS, + SampleBatch.EPS_ID, + SampleBatch.AGENT_INDEX, + "weights", + ]: + assert key in view_req_policy + # None of the view cols has a special underlying data_col, + # except next-obs. + if key != SampleBatch.NEXT_OBS: + assert view_req_policy[key].data_col is None + else: + assert view_req_policy[key].data_col == SampleBatch.OBS + assert view_req_policy[key].shift == 1 + rollout_worker = algo.env_runner + sample_batch = rollout_worker.sample() + sample_batch = convert_ma_batch_to_sample_batch(sample_batch) + expected_count = config.num_envs_per_env_runner * config.rollout_fragment_length + assert sample_batch.count == expected_count + for v in sample_batch.values(): + assert len(v) == expected_count + algo.stop() def test_traj_view_lstm_prev_actions_and_rewards(self): """Tests, whether Policy/Model return correct LSTM ViewRequirements.""" @@ -121,113 +117,67 @@ def test_traj_view_lstm_prev_actions_and_rewards(self): .env_runners(create_env_on_local_worker=True) ) - for _ in framework_iterator(config): - algo = config.build() - policy = algo.get_policy() - view_req_model = policy.model.view_requirements - view_req_policy = policy.view_requirements - # 7=obs, prev-a + r, 2x state-in, 2x state-out. 
- assert len(view_req_model) == 7, view_req_model - assert len(view_req_policy) == 23, (len(view_req_policy), view_req_policy) - for key in [ - SampleBatch.OBS, - SampleBatch.ACTIONS, - SampleBatch.REWARDS, - SampleBatch.TERMINATEDS, - SampleBatch.TRUNCATEDS, + algo = config.build() + policy = algo.get_policy() + view_req_model = policy.model.view_requirements + view_req_policy = policy.view_requirements + # 7=obs, prev-a + r, 2x state-in, 2x state-out. + assert len(view_req_model) == 7, view_req_model + assert len(view_req_policy) == 23, (len(view_req_policy), view_req_policy) + for key in [ + SampleBatch.OBS, + SampleBatch.ACTIONS, + SampleBatch.REWARDS, + SampleBatch.TERMINATEDS, + SampleBatch.TRUNCATEDS, + SampleBatch.NEXT_OBS, + SampleBatch.VF_PREDS, + SampleBatch.PREV_ACTIONS, + SampleBatch.PREV_REWARDS, + "advantages", + "value_targets", + SampleBatch.ACTION_DIST_INPUTS, + SampleBatch.ACTION_LOGP, + ]: + assert key in view_req_policy + + if key == SampleBatch.PREV_ACTIONS: + assert view_req_policy[key].data_col == SampleBatch.ACTIONS + assert view_req_policy[key].shift == -1 + elif key == SampleBatch.PREV_REWARDS: + assert view_req_policy[key].data_col == SampleBatch.REWARDS + assert view_req_policy[key].shift == -1 + elif key not in [ SampleBatch.NEXT_OBS, - SampleBatch.VF_PREDS, SampleBatch.PREV_ACTIONS, SampleBatch.PREV_REWARDS, - "advantages", - "value_targets", - SampleBatch.ACTION_DIST_INPUTS, - SampleBatch.ACTION_LOGP, ]: - assert key in view_req_policy - - if key == SampleBatch.PREV_ACTIONS: - assert view_req_policy[key].data_col == SampleBatch.ACTIONS - assert view_req_policy[key].shift == -1 - elif key == SampleBatch.PREV_REWARDS: - assert view_req_policy[key].data_col == SampleBatch.REWARDS - assert view_req_policy[key].shift == -1 - elif key not in [ - SampleBatch.NEXT_OBS, - SampleBatch.PREV_ACTIONS, - SampleBatch.PREV_REWARDS, - ]: - assert view_req_policy[key].data_col is None - else: - assert view_req_policy[key].data_col == SampleBatch.OBS - assert view_req_policy[key].shift == 1 - - rollout_worker = algo.env_runner - sample_batch = rollout_worker.sample() - sample_batch = convert_ma_batch_to_sample_batch(sample_batch) - - # Rollout fragment length should be auto-computed to 2000: - # 2 workers, 1 env per worker, train batch size=4000 -> 2000 per worker. - self.assertEqual(sample_batch.count, 2000, "ppo rollout count != 2000") - self.assertEqual(sum(sample_batch["seq_lens"]), sample_batch.count) - self.assertEqual( - len(sample_batch["seq_lens"]), sample_batch["state_in_0"].shape[0] - ) + assert view_req_policy[key].data_col is None + else: + assert view_req_policy[key].data_col == SampleBatch.OBS + assert view_req_policy[key].shift == 1 - # check if non-zero state_ins are pointing to the correct state_outs - seq_counters = np.cumsum(sample_batch["seq_lens"]) - for i in range(sample_batch["state_in_0"].shape[0]): - state_in = sample_batch["state_in_0"][i] - if np.any(state_in != 0): - # non-zero state-in should be one of th state_outs. - state_out_ind = seq_counters[i - 1] - 1 - check(sample_batch["state_out_0"][state_out_ind], state_in) - algo.stop() - - def test_traj_view_attention_net(self): - config = ( - ppo.PPOConfig() - # Batch-norm models have not been migrated to the RL Module API yet. 
- .api_stack(enable_rl_module_and_learner=False) - .environment( - "ray.rllib.examples.envs.classes.debug_counter_env.DebugCounterEnv", - env_config={"config": {"start_at_t": 1}}, # first obs is [1.0] - ) - .env_runners(num_env_runners=0) - .callbacks(MyCallbacks) - # Setup attention net. - .training( - model={ - "custom_model": GTrXLNet, - "custom_model_config": { - "num_transformer_units": 1, - "attention_dim": 64, - "num_heads": 2, - "memory_inference": 50, - "memory_training": 50, - "head_dim": 32, - "ff_hidden_dim": 32, - }, - "max_seq_len": 50, - }, - # Test with odd batch numbers. - train_batch_size=1031, - sgd_minibatch_size=201, - num_sgd_iter=5, - ) + rollout_worker = algo.env_runner + sample_batch = rollout_worker.sample() + sample_batch = convert_ma_batch_to_sample_batch(sample_batch) + + # Rollout fragment length should be auto-computed to 2000: + # 2 workers, 1 env per worker, train batch size=4000 -> 2000 per worker. + self.assertEqual(sample_batch.count, 2000, "ppo rollout count != 2000") + self.assertEqual(sum(sample_batch["seq_lens"]), sample_batch.count) + self.assertEqual( + len(sample_batch["seq_lens"]), sample_batch["state_in_0"].shape[0] ) - for _ in framework_iterator(config, frameworks="tf2"): - algo = config.build() - rw = algo.env_runner - sample = rw.sample() - assert sample.count == algo.config.get_rollout_fragment_length() - results = algo.train() - assert ( - results[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"] - == config["train_batch_size"] - ) - algo.stop() + # check if non-zero state_ins are pointing to the correct state_outs + seq_counters = np.cumsum(sample_batch["seq_lens"]) + for i in range(sample_batch["state_in_0"].shape[0]): + state_in = sample_batch["state_in_0"][i] + if np.any(state_in != 0): + # non-zero state-in should be one of th state_outs. + state_out_ind = seq_counters[i - 1] - 1 + check(sample_batch["state_out_0"][state_out_ind], state_in) + algo.stop() def test_traj_view_next_action(self): action_space = Discrete(2) diff --git a/rllib/examples/offline_rl/offline_rl.py b/rllib/examples/offline_rl/offline_rl.py index 3442d2b130b5..6d19252bca27 100644 --- a/rllib/examples/offline_rl/offline_rl.py +++ b/rllib/examples/offline_rl/offline_rl.py @@ -107,7 +107,6 @@ num_iterations = 5 min_reward = -300 - # Test for torch framework (tf not implemented yet). 
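
The state_in/state_out consistency check in the trajectory-view hunk above walks the cumulative sum of `seq_lens`: the state fed into sequence `i` must equal the last state written out by sequence `i - 1`. A toy illustration of the indexing with made-up sequence lengths:

    import numpy as np

    seq_lens = np.array([5, 3, 4])
    seq_counters = np.cumsum(seq_lens)       # [5, 8, 12]
    i = 1                                    # second sequence in the batch
    state_out_ind = seq_counters[i - 1] - 1  # 4 == index of the last step of sequence 0
    assert state_out_ind == 4
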
cql_algorithm = cql.CQL(config=config) learnt = False for i in range(num_iterations): diff --git a/rllib/models/tests/test_action_distributions.py b/rllib/models/tests/test_action_distributions.py index 3ba9b540af11..6de0c1aa62a0 100644 --- a/rllib/models/tests/test_action_distributions.py +++ b/rllib/models/tests/test_action_distributions.py @@ -5,15 +5,6 @@ import tree # pip install dm_tree import unittest -from ray.rllib.models.tf.tf_action_dist import ( - Beta, - Categorical, - DiagGaussian, - GumbelSoftmax, - MultiActionDistribution, - MultiCategorical, - SquashedGaussian, -) from ray.rllib.models.torch.torch_action_dist import ( TorchBeta, TorchCategorical, @@ -30,7 +21,7 @@ SMALL_NUMBER, LARGE_INTEGER, ) -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import check tf1, tf, tfv = try_import_tf() torch, _ = try_import_torch() @@ -81,10 +72,7 @@ def _stability_test( dist = distribution_cls(inputs, {}, **(extra_kwargs or {})) for _ in range(100): sample = dist.sample() - if fw != "tf": - sample_check = sample.numpy() - else: - sample_check = sess.run(sample) + sample_check = sample.numpy() assert not np.any(np.isnan(sample_check)) assert np.all(np.isfinite(sample_check)) if bounds: @@ -97,10 +85,7 @@ def _stability_test( assert bounds[0] in sample_check assert bounds[1] in sample_check logp = dist.logp(sample) - if fw != "tf": - logp_check = logp.numpy() - else: - logp_check = sess.run(logp) + logp_check = logp.numpy() assert not np.any(np.isnan(logp_check)) assert np.all(np.isfinite(logp_check)) @@ -117,50 +102,45 @@ def test_categorical(self): inputs = inputs_space.sample() - for fw, sess in framework_iterator(session=True): - # Create the correct distribution object. - cls = Categorical if fw != "torch" else TorchCategorical - categorical = cls(inputs, {}) - - # Do a stability test using extreme NN outputs to see whether - # sampling and logp'ing result in NaN or +/-inf values. - self._stability_test( - cls, - inputs_space.shape, - fw=fw, - sess=sess, - bounds=(0, num_categories - 1), - ) + # Create the correct distribution object. + cls = TorchCategorical + categorical = cls(inputs, {}) + + # Do a stability test using extreme NN outputs to see whether + # sampling and logp'ing result in NaN or +/-inf values. + self._stability_test( + cls, + inputs_space.shape, + fw="torch", + sess=None, + bounds=(0, num_categories - 1), + ) - # Batch of size=3 and deterministic (True). - expected = np.transpose(np.argmax(inputs, axis=-1)) - # Sample, expect always max value - # (max likelihood for deterministic draw). - out = categorical.deterministic_sample() - check(out, expected) - - # Batch of size=3 and non-deterministic -> expect roughly the mean. - out = categorical.sample() - check( - tf.reduce_mean(out) if fw != "torch" else torch.mean(out.float()), - 1.0, - decimals=0, - ) + # Batch of size=3 and deterministic (True). + expected = np.transpose(np.argmax(inputs, axis=-1)) + # Sample, expect always max value + # (max likelihood for deterministic draw). + out = categorical.deterministic_sample() + check(out, expected) - # Test log-likelihood outputs. - probs = softmax(inputs) - values = values_space.sample() + # Batch of size=3 and non-deterministic -> expect roughly the mean. 
+ out = categorical.sample() + check(torch.mean(out.float()), 1.0, decimals=0) - out = categorical.logp(values if fw != "torch" else torch.Tensor(values)) - expected = [] - for i in range(batch_size): - expected.append(np.sum(np.log(np.array(probs[i][values[i]])))) - check(out, expected, decimals=4) + # Test log-likelihood outputs. + probs = softmax(inputs) + values = values_space.sample() - # Test entropy outputs. - out = categorical.entropy() - expected_entropy = -np.sum(probs * np.log(probs), -1) - check(out, expected_entropy) + out = categorical.logp(torch.Tensor(values)) + expected = [] + for i in range(batch_size): + expected.append(np.sum(np.log(np.array(probs[i][values[i]])))) + check(out, expected, decimals=4) + + # Test entropy outputs. + out = categorical.entropy() + expected_entropy = -np.sum(probs * np.log(probs), -1) + check(out, expected_entropy) def test_multi_categorical(self): batch_size = 100 @@ -183,66 +163,59 @@ def test_multi_categorical(self): input_lengths = [num_categories] * num_sub_distributions inputs_split = np.split(inputs, num_sub_distributions, axis=1) - for fw, sess in framework_iterator(session=True): - # Create the correct distribution object. - cls = MultiCategorical if fw != "torch" else TorchMultiCategorical - multi_categorical = cls(inputs, None, input_lengths) - - # Do a stability test using extreme NN outputs to see whether - # sampling and logp'ing result in NaN or +/-inf values. - self._stability_test( - cls, - inputs_space.shape, - fw=fw, - sess=sess, - bounds=(0, num_categories - 1), - extra_kwargs={"input_lens": input_lengths}, - ) + # Create the correct distribution object. + cls = TorchMultiCategorical + multi_categorical = cls(inputs, None, input_lengths) + + # Do a stability test using extreme NN outputs to see whether + # sampling and logp'ing result in NaN or +/-inf values. + self._stability_test( + cls, + inputs_space.shape, + fw="torch", + sess=None, + bounds=(0, num_categories - 1), + extra_kwargs={"input_lens": input_lengths}, + ) - # Batch of size=3 and deterministic (True). - expected = np.transpose(np.argmax(inputs_split, axis=-1)) - # Sample, expect always max value - # (max likelihood for deterministic draw). - out = multi_categorical.deterministic_sample() - check(out, expected) - - # Batch of size=3 and non-deterministic -> expect roughly the mean. - out = multi_categorical.sample() - check( - tf.reduce_mean(out) if fw != "torch" else torch.mean(out.float()), - 1.0, - decimals=0, - ) + # Batch of size=3 and deterministic (True). + expected = np.transpose(np.argmax(inputs_split, axis=-1)) + # Sample, expect always max value + # (max likelihood for deterministic draw). + out = multi_categorical.deterministic_sample() + check(out, expected) + + # Batch of size=3 and non-deterministic -> expect roughly the mean. + out = multi_categorical.sample() + check(torch.mean(out.float()), 1.0, decimals=0) - # Test log-likelihood outputs. - probs = softmax(inputs_split) - values = values_space.sample() - - out = multi_categorical.logp( - values - if fw != "torch" - else [torch.Tensor(values[i]) for i in range(num_sub_distributions)] - ) # v in np.stack(values, 1)]) - expected = [] - for i in range(batch_size): - expected.append( - np.sum( - np.log( - np.array( - [ - probs[j][i][values[j][i]] - for j in range(num_sub_distributions) - ] - ) + # Test log-likelihood outputs. 
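
The reference values in these categorical hunks are computed straight from softmax probabilities: the logp of a sampled class is the log of its probability, and the entropy is -sum(p * log p). A standalone cross-check against torch.distributions (scipy's softmax stands in here for the test utility of the same name):

    import numpy as np
    import torch
    from scipy.special import softmax

    logits = np.random.randn(3, 4).astype(np.float32)
    probs = softmax(logits, axis=-1)
    expected_entropy = -np.sum(probs * np.log(probs), -1)

    dist = torch.distributions.Categorical(logits=torch.from_numpy(logits))
    assert np.allclose(dist.entropy().numpy(), expected_entropy, atol=1e-5)
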
+ probs = softmax(inputs_split) + values = values_space.sample() + + out = multi_categorical.logp( + [torch.Tensor(values[i]) for i in range(num_sub_distributions)] + ) + expected = [] + for i in range(batch_size): + expected.append( + np.sum( + np.log( + np.array( + [ + probs[j][i][values[j][i]] + for j in range(num_sub_distributions) + ] ) ) ) - check(out, expected, decimals=4) + ) + check(out, expected, decimals=4) - # Test entropy outputs. - out = multi_categorical.entropy() - expected_entropy = -np.sum(np.sum(probs * np.log(probs), 0), -1) - check(out, expected_entropy) + # Test entropy outputs. + out = multi_categorical.entropy() + expected_entropy = -np.sum(np.sum(probs * np.log(probs), 0), -1) + check(out, expected_entropy) def test_squashed_gaussian(self): """Tests the SquashedGaussian ActionDistribution for all frameworks.""" @@ -251,172 +224,138 @@ def test_squashed_gaussian(self): low, high = -2.0, 1.0 - for fw, sess in framework_iterator(session=True): - cls = SquashedGaussian if fw != "torch" else TorchSquashedGaussian - - # Do a stability test using extreme NN outputs to see whether - # sampling and logp'ing result in NaN or +/-inf values. - self._stability_test( - cls, input_space.shape, fw=fw, sess=sess, bounds=(low, high) - ) - - # Batch of size=n and deterministic. - inputs = input_space.sample() - means, _ = np.split(inputs, 2, axis=-1) - squashed_distribution = cls(inputs, {}, low=low, high=high) - expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low - # Sample n times, expect always mean value (deterministic draw). - out = squashed_distribution.deterministic_sample() - check(out, expected) - - # Batch of size=n and non-deterministic -> expect roughly the mean. - inputs = input_space.sample() - means, log_stds = np.split(inputs, 2, axis=-1) - squashed_distribution = cls(inputs, {}, low=low, high=high) - expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low - values = squashed_distribution.sample() - if sess: - values = sess.run(values) - else: - values = values.numpy() - self.assertTrue(np.max(values) <= high) - self.assertTrue(np.min(values) >= low) - - check(np.mean(values), expected.mean(), decimals=1) + cls = TorchSquashedGaussian - # Test log-likelihood outputs. - sampled_action_logp = squashed_distribution.logp( - values if fw != "torch" else torch.Tensor(values) - ) - if sess: - sampled_action_logp = sess.run(sampled_action_logp) - else: - sampled_action_logp = sampled_action_logp.numpy() - # Convert to parameters for distr. - stds = np.exp(np.clip(log_stds, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT)) - # Unsquash values, then get log-llh from regular gaussian. - # atanh_in = np.clip((values - low) / (high - low) * 2.0 - 1.0, - # -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER) - normed_values = (values - low) / (high - low) * 2.0 - 1.0 - save_normed_values = np.clip( - normed_values, -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER - ) - unsquashed_values = np.arctanh(save_normed_values) - log_prob_unsquashed = np.sum( - np.log(norm.pdf(unsquashed_values, means, stds)), -1 - ) - log_prob = log_prob_unsquashed - np.sum( - np.log(1 - np.tanh(unsquashed_values) ** 2), axis=-1 - ) - check(np.sum(sampled_action_logp), np.sum(log_prob), rtol=0.05) + # Do a stability test using extreme NN outputs to see whether + # sampling and logp'ing result in NaN or +/-inf values. + self._stability_test( + cls, input_space.shape, fw="torch", sess=None, bounds=(low, high) + ) - # NN output. 
- means = np.array( - [[0.1, 0.2, 0.3, 0.4, 50.0], [-0.1, -0.2, -0.3, -0.4, -1.0]] - ) - log_stds = np.array( - [[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]] - ) - squashed_distribution = cls( - inputs=np.concatenate([means, log_stds], axis=-1), - model={}, - low=low, - high=high, - ) - # Convert to parameters for distr. - stds = np.exp(log_stds) - # Values to get log-likelihoods for. - values = np.array( - [[0.9, 0.2, 0.4, -0.1, -1.05], [-0.9, -0.2, 0.4, -0.1, -1.05]] - ) + # Batch of size=n and deterministic. + inputs = input_space.sample() + means, _ = np.split(inputs, 2, axis=-1) + squashed_distribution = cls(inputs, {}, low=low, high=high) + expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low + # Sample n times, expect always mean value (deterministic draw). + out = squashed_distribution.deterministic_sample() + check(out, expected) + + # Batch of size=n and non-deterministic -> expect roughly the mean. + inputs = input_space.sample() + means, log_stds = np.split(inputs, 2, axis=-1) + squashed_distribution = cls(inputs, {}, low=low, high=high) + expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low + values = squashed_distribution.sample() + values = values.numpy() + self.assertTrue(np.max(values) <= high) + self.assertTrue(np.min(values) >= low) + + check(np.mean(values), expected.mean(), decimals=1) + + # Test log-likelihood outputs. + sampled_action_logp = squashed_distribution.logp(torch.Tensor(values)) + sampled_action_logp = sampled_action_logp.numpy() + # Convert to parameters for distr. + stds = np.exp(np.clip(log_stds, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT)) + # Unsquash values, then get log-llh from regular gaussian. + # atanh_in = np.clip((values - low) / (high - low) * 2.0 - 1.0, + # -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER) + normed_values = (values - low) / (high - low) * 2.0 - 1.0 + save_normed_values = np.clip( + normed_values, -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER + ) + unsquashed_values = np.arctanh(save_normed_values) + log_prob_unsquashed = np.sum( + np.log(norm.pdf(unsquashed_values, means, stds)), -1 + ) + log_prob = log_prob_unsquashed - np.sum( + np.log(1 - np.tanh(unsquashed_values) ** 2), axis=-1 + ) + check(np.sum(sampled_action_logp), np.sum(log_prob), rtol=0.05) + + # NN output. + means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0], [-0.1, -0.2, -0.3, -0.4, -1.0]]) + log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]]) + squashed_distribution = cls( + inputs=np.concatenate([means, log_stds], axis=-1), + model={}, + low=low, + high=high, + ) + # Convert to parameters for distr. + stds = np.exp(log_stds) + # Values to get log-likelihoods for. + values = np.array( + [[0.9, 0.2, 0.4, -0.1, -1.05], [-0.9, -0.2, 0.4, -0.1, -1.05]] + ) - # Unsquash values, then get log-llh from regular gaussian. - unsquashed_values = np.arctanh((values - low) / (high - low) * 2.0 - 1.0) - log_prob_unsquashed = np.sum( - np.log(norm.pdf(unsquashed_values, means, stds)), -1 - ) - log_prob = log_prob_unsquashed - np.sum( - np.log(1 - np.tanh(unsquashed_values) ** 2), axis=-1 - ) + # Unsquash values, then get log-llh from regular gaussian. 
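
The reference log-likelihood in this test is the standard tanh-squashing change of variables: map the action back into (-1, 1), invert the tanh, take the diagonal-Gaussian log-density, and subtract sum(log(1 - tanh(u)^2)). A compact restatement of that computation as a helper (names are illustrative; the test additionally clips near the bounds before atanh for numerical safety):

    import numpy as np
    from scipy.stats import norm

    def reference_squashed_logp(values, means, log_stds, low, high):
        stds = np.exp(log_stds)
        # Map the squashed action back into (-1, 1), then invert the tanh.
        unsquashed = np.arctanh((values - low) / (high - low) * 2.0 - 1.0)
        logp_gaussian = np.sum(np.log(norm.pdf(unsquashed, means, stds)), -1)
        # Change-of-variables correction for the tanh squashing.
        return logp_gaussian - np.sum(np.log(1 - np.tanh(unsquashed) ** 2), axis=-1)
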
+ unsquashed_values = np.arctanh((values - low) / (high - low) * 2.0 - 1.0) + log_prob_unsquashed = np.sum( + np.log(norm.pdf(unsquashed_values, means, stds)), -1 + ) + log_prob = log_prob_unsquashed - np.sum( + np.log(1 - np.tanh(unsquashed_values) ** 2), axis=-1 + ) - outs = squashed_distribution.logp( - values if fw != "torch" else torch.Tensor(values) - ) - if sess: - outs = sess.run(outs) - check(outs, log_prob, decimals=4) + outs = squashed_distribution.logp(torch.Tensor(values)) + check(outs, log_prob, decimals=4) def test_diag_gaussian(self): """Tests the DiagGaussian ActionDistribution for all frameworks.""" input_space = Box(-2.0, 1.0, shape=(2000, 10)) input_space.seed(42) - for fw, sess in framework_iterator(session=True): - cls = DiagGaussian if fw != "torch" else TorchDiagGaussian - - # Do a stability test using extreme NN outputs to see whether - # sampling and logp'ing result in NaN or +/-inf values. - self._stability_test(cls, input_space.shape, fw=fw, sess=sess) - - # Batch of size=n and deterministic. - inputs = input_space.sample() - means, _ = np.split(inputs, 2, axis=-1) - diag_distribution = cls(inputs, {}) - expected = means - # Sample n times, expect always mean value (deterministic draw). - out = diag_distribution.deterministic_sample() - check(out, expected) - - # Batch of size=n and non-deterministic -> expect roughly the mean. - inputs = input_space.sample() - means, log_stds = np.split(inputs, 2, axis=-1) - diag_distribution = cls(inputs, {}) - expected = means - values = diag_distribution.sample() - if sess: - values = sess.run(values) - else: - values = values.numpy() - check(np.mean(values), expected.mean(), decimals=1) - - # Test log-likelihood outputs. - sampled_action_logp = diag_distribution.logp( - values if fw != "torch" else torch.Tensor(values) - ) - if sess: - sampled_action_logp = sess.run(sampled_action_logp) - else: - sampled_action_logp = sampled_action_logp.numpy() - - # NN output. - means = np.array( - [[0.1, 0.2, 0.3, 0.4, 50.0], [-0.1, -0.2, -0.3, -0.4, -1.0]], - dtype=np.float32, - ) - log_stds = np.array( - [[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]], - dtype=np.float32, - ) + cls = TorchDiagGaussian + + # Do a stability test using extreme NN outputs to see whether + # sampling and logp'ing result in NaN or +/-inf values. + self._stability_test(cls, input_space.shape, fw="torch") + + # Batch of size=n and deterministic. + inputs = input_space.sample() + means, _ = np.split(inputs, 2, axis=-1) + diag_distribution = cls(inputs, {}) + expected = means + # Sample n times, expect always mean value (deterministic draw). + out = diag_distribution.deterministic_sample() + check(out, expected) + + # Batch of size=n and non-deterministic -> expect roughly the mean. + inputs = input_space.sample() + means, log_stds = np.split(inputs, 2, axis=-1) + diag_distribution = cls(inputs, {}) + expected = means + values = diag_distribution.sample() + values = values.numpy() + check(np.mean(values), expected.mean(), decimals=1) + + # NN output. + means = np.array( + [[0.1, 0.2, 0.3, 0.4, 50.0], [-0.1, -0.2, -0.3, -0.4, -1.0]], + dtype=np.float32, + ) + log_stds = np.array( + [[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]], + dtype=np.float32, + ) - diag_distribution = cls( - inputs=np.concatenate([means, log_stds], axis=-1), model={} - ) - # Convert to parameters for distr. - stds = np.exp(log_stds) - # Values to get log-likelihoods for. 
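
The scipy-based references in these hunks sum independent per-dimension Gaussian log-densities; for a single dimension the closed form is -log(sigma) - 0.5*log(2*pi) - (x - mu)^2 / (2*sigma^2). A one-line sanity check with hypothetical numbers:

    import numpy as np
    from scipy.stats import norm

    x, mu, sigma = 0.4, 0.1, np.exp(0.8)
    closed_form = -np.log(sigma) - 0.5 * np.log(2 * np.pi) - (x - mu) ** 2 / (2 * sigma**2)
    assert np.isclose(np.log(norm.pdf(x, mu, sigma)), closed_form)
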
- values = np.array( - [[0.9, 0.2, 0.4, -0.1, -1.05], [-0.9, -0.2, 0.4, -0.1, -1.05]] - ) + diag_distribution = cls( + inputs=np.concatenate([means, log_stds], axis=-1), model={} + ) + # Convert to parameters for distr. + stds = np.exp(log_stds) + # Values to get log-likelihoods for. + values = np.array( + [[0.9, 0.2, 0.4, -0.1, -1.05], [-0.9, -0.2, 0.4, -0.1, -1.05]] + ) - # get log-llh from regular gaussian. - log_prob = np.sum(np.log(norm.pdf(values, means, stds)), -1) + # get log-llh from regular gaussian. + log_prob = np.sum(np.log(norm.pdf(values, means, stds)), -1) - outs = diag_distribution.logp( - values if fw != "torch" else torch.Tensor(values) - ) - if sess: - outs = sess.run(outs) - check(outs, log_prob, decimals=4) + outs = diag_distribution.logp(torch.Tensor(values)) + check(outs, log_prob, decimals=4) def test_beta(self): input_space = Box(-2.0, 1.0, shape=(2000, 10)) @@ -425,81 +364,41 @@ def test_beta(self): plain_beta_value_space = Box(0.0, 1.0, shape=(2000, 5)) plain_beta_value_space.seed(42) - for fw, sess in framework_iterator(session=True): - cls = TorchBeta if fw == "torch" else Beta - inputs = input_space.sample() - beta_distribution = cls(inputs, {}, low=low, high=high) - - inputs = beta_distribution.inputs - if sess: - inputs = sess.run(inputs) - else: - inputs = inputs.numpy() - alpha, beta_ = np.split(inputs, 2, axis=-1) - - # Mean for a Beta distribution: 1 / [1 + (beta/alpha)] - expected = (1.0 / (1.0 + beta_ / alpha)) * (high - low) + low - # Sample n times, expect always mean value (deterministic draw). - out = beta_distribution.deterministic_sample() - check(out, expected, rtol=0.01) - - # Batch of size=n and non-deterministic -> expect roughly the mean. - values = beta_distribution.sample() - if sess: - values = sess.run(values) - else: - values = values.numpy() - self.assertTrue(np.max(values) <= high) - self.assertTrue(np.min(values) >= low) - - check(np.mean(values), expected.mean(), decimals=1) - - # Test log-likelihood outputs (against scipy). - inputs = input_space.sample() - beta_distribution = cls(inputs, {}, low=low, high=high) - inputs = beta_distribution.inputs - if sess: - inputs = sess.run(inputs) - else: - inputs = inputs.numpy() - alpha, beta_ = np.split(inputs, 2, axis=-1) - - values = plain_beta_value_space.sample() - values_scaled = values * (high - low) + low - if fw == "torch": - values_scaled = torch.Tensor(values_scaled) - print(values_scaled) - out = beta_distribution.logp(values_scaled) - check(out, np.sum(np.log(beta.pdf(values, alpha, beta_)), -1), rtol=0.01) - - # TODO(sven): Test entropy outputs (against scipy). - - def test_gumbel_softmax(self): - """Tests the GumbelSoftmax ActionDistribution (tf + eager only).""" - for fw, sess in framework_iterator(frameworks=("tf2", "tf"), session=True): - batch_size = 1000 - num_categories = 5 - input_space = Box(-1.0, 1.0, shape=(batch_size, num_categories)) - input_space.seed(42) - - # Batch of size=n and deterministic. - inputs = input_space.sample() - gumbel_softmax = GumbelSoftmax(inputs, {}, temperature=1.0) - - expected = softmax(inputs) - # Sample n times, expect always mean value (deterministic draw). - out = gumbel_softmax.deterministic_sample() - check(out, expected) - - # Batch of size=n and non-deterministic -> expect roughly that - # the max-likelihood (argmax) ints are output (most of the time). 
- inputs = input_space.sample() - gumbel_softmax = GumbelSoftmax(inputs, {}, temperature=1.0) - expected_mean = np.mean(np.argmax(inputs, -1)).astype(np.float32) - outs = gumbel_softmax.sample() - if sess: - outs = sess.run(outs) - check(np.mean(np.argmax(outs, -1)), expected_mean, rtol=0.08) + cls = TorchBeta + inputs = input_space.sample() + beta_distribution = cls(inputs, {}, low=low, high=high) + + inputs = beta_distribution.inputs + inputs = inputs.numpy() + alpha, beta_ = np.split(inputs, 2, axis=-1) + + # Mean for a Beta distribution: 1 / [1 + (beta/alpha)] + expected = (1.0 / (1.0 + beta_ / alpha)) * (high - low) + low + # Sample n times, expect always mean value (deterministic draw). + out = beta_distribution.deterministic_sample() + check(out, expected, rtol=0.01) + + # Batch of size=n and non-deterministic -> expect roughly the mean. + values = beta_distribution.sample() + values = values.numpy() + self.assertTrue(np.max(values) <= high) + self.assertTrue(np.min(values) >= low) + + check(np.mean(values), expected.mean(), decimals=1) + + # Test log-likelihood outputs (against scipy). + inputs = input_space.sample() + beta_distribution = cls(inputs, {}, low=low, high=high) + inputs = beta_distribution.inputs + inputs = inputs.numpy() + alpha, beta_ = np.split(inputs, 2, axis=-1) + + values = plain_beta_value_space.sample() + values_scaled = values * (high - low) + low + values_scaled = torch.Tensor(values_scaled) + print(values_scaled) + out = beta_distribution.logp(values_scaled) + check(out, np.sum(np.log(beta.pdf(values, alpha, beta_)), -1), rtol=0.01) def test_multi_action_distribution(self): """Tests the MultiActionDistribution (across all frameworks).""" @@ -539,152 +438,130 @@ def test_multi_action_distribution(self): ) value_space.seed(42) - for fw, sess in framework_iterator(session=True): - if fw == "torch": - cls = TorchMultiActionDistribution - child_distr_cls = [ - TorchCategorical, - TorchDiagGaussian, - partial(TorchBeta, low=low, high=high), - ] - else: - cls = MultiActionDistribution - child_distr_cls = [ - Categorical, - DiagGaussian, - partial(Beta, low=low, high=high), - ] - - inputs = list(input_space.sample()) - distr = cls( - np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), - model={}, - action_space=value_space, - child_distributions=child_distr_cls, - input_lens=[4, 6, 4], - ) + cls = TorchMultiActionDistribution + child_distr_cls = [ + TorchCategorical, + TorchDiagGaussian, + partial(TorchBeta, low=low, high=high), + ] - # Adjust inputs for the Beta distr just as Beta itself does. - inputs[2]["a"] = np.clip( - inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER) - ) - inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 - # Sample deterministically. - expected_det = [ - np.argmax(inputs[0], axis=-1), - inputs[1][:, :3], # [:3]=Mean values. - # Mean for a Beta distribution: - # 1 / [1 + (beta/alpha)] * range + low - (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, 0:2])) - * (high - low) - + low, - ] - out = distr.deterministic_sample() - if sess: - out = sess.run(out) - check(out[0], expected_det[0]) - check(out[1], expected_det[1]) - check(out[2]["a"], expected_det[2]) - - # Stochastic sampling -> expect roughly the mean. - inputs = list(input_space.sample()) - # Fix categorical inputs (not needed for distribution itself, but - # for our expectation calculations). - inputs[0] = softmax(inputs[0], -1) - # Fix std inputs (shouldn't be too large for this test). 
- inputs[1][:, 3:] = std_space.sample() - # Adjust inputs for the Beta distr just as Beta itself does. - inputs[2]["a"] = np.clip( - inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER) - ) - inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 - distr = cls( - np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), - model={}, - action_space=value_space, - child_distributions=child_distr_cls, - input_lens=[4, 6, 4], - ) - expected_mean = [ - np.mean(np.sum(inputs[0] * np.array([0, 1, 2, 3]), -1)), - inputs[1][:, :3], # [:3]=Mean values. - # Mean for a Beta distribution: - # 1 / [1 + (beta/alpha)] * range + low - (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, :2])) - * (high - low) - + low, - ] - out = distr.sample() - if sess: - out = sess.run(out) - out = list(out) - if fw == "torch": - out[0] = out[0].numpy() - out[1] = out[1].numpy() - out[2]["a"] = out[2]["a"].numpy() - check(np.mean(out[0]), expected_mean[0], decimals=1) - check(np.mean(out[1], 0), np.mean(expected_mean[1], 0), decimals=1) - check(np.mean(out[2]["a"], 0), np.mean(expected_mean[2], 0), decimals=1) - - # Test log-likelihood outputs. - # Make sure beta-values are within 0.0 and 1.0 for the numpy - # calculation (which doesn't have scaling). - inputs = list(input_space.sample()) - # Adjust inputs for the Beta distr just as Beta itself does. - inputs[2]["a"] = np.clip( - inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER) - ) - inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 - distr = cls( - np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), - model={}, - action_space=value_space, - child_distributions=child_distr_cls, - input_lens=[4, 6, 4], - ) - inputs[0] = softmax(inputs[0], -1) - values = list(value_space.sample()) - log_prob_beta = np.log( - beta.pdf(values[2]["a"], inputs[2]["a"][:, :2], inputs[2]["a"][:, 2:]) - ) - # Now do the up-scaling for [2] (beta values) to be between - # low/high. - values[2]["a"] = values[2]["a"] * (high - low) + low - inputs[1][:, 3:] = np.exp(inputs[1][:, 3:]) - expected_log_llh = np.sum( - np.concatenate( - [ - np.expand_dims( - np.log([i[values[0][j]] for j, i in enumerate(inputs[0])]), - -1, - ), - np.log(norm.pdf(values[1], inputs[1][:, :3], inputs[1][:, 3:])), - log_prob_beta, - ], - -1, - ), + inputs = list(input_space.sample()) + distr = cls( + np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), + model={}, + action_space=value_space, + child_distributions=child_distr_cls, + input_lens=[4, 6, 4], + ) + + # Adjust inputs for the Beta distr just as Beta itself does. + inputs[2]["a"] = np.clip( + inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER) + ) + inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 + # Sample deterministically. + expected_det = [ + np.argmax(inputs[0], axis=-1), + inputs[1][:, :3], # [:3]=Mean values. + # Mean for a Beta distribution: + # 1 / [1 + (beta/alpha)] * range + low + (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, 0:2])) + * (high - low) + + low, + ] + out = distr.deterministic_sample() + check(out[0], expected_det[0]) + check(out[1], expected_det[1]) + check(out[2]["a"], expected_det[2]) + + # Stochastic sampling -> expect roughly the mean. + inputs = list(input_space.sample()) + # Fix categorical inputs (not needed for distribution itself, but + # for our expectation calculations). + inputs[0] = softmax(inputs[0], -1) + # Fix std inputs (shouldn't be too large for this test). 
+ inputs[1][:, 3:] = std_space.sample() + # Adjust inputs for the Beta distr just as Beta itself does. + inputs[2]["a"] = np.clip( + inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER) + ) + inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 + distr = cls( + np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), + model={}, + action_space=value_space, + child_distributions=child_distr_cls, + input_lens=[4, 6, 4], + ) + expected_mean = [ + np.mean(np.sum(inputs[0] * np.array([0, 1, 2, 3]), -1)), + inputs[1][:, :3], # [:3]=Mean values. + # Mean for a Beta distribution: + # 1 / [1 + (beta/alpha)] * range + low + (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, :2])) * (high - low) + + low, + ] + out = distr.sample() + out = list(out) + out[0] = out[0].numpy() + out[1] = out[1].numpy() + out[2]["a"] = out[2]["a"].numpy() + check(np.mean(out[0]), expected_mean[0], decimals=1) + check(np.mean(out[1], 0), np.mean(expected_mean[1], 0), decimals=1) + check(np.mean(out[2]["a"], 0), np.mean(expected_mean[2], 0), decimals=1) + + # Test log-likelihood outputs. + # Make sure beta-values are within 0.0 and 1.0 for the numpy + # calculation (which doesn't have scaling). + inputs = list(input_space.sample()) + # Adjust inputs for the Beta distr just as Beta itself does. + inputs[2]["a"] = np.clip( + inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER) + ) + inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0 + distr = cls( + np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1), + model={}, + action_space=value_space, + child_distributions=child_distr_cls, + input_lens=[4, 6, 4], + ) + inputs[0] = softmax(inputs[0], -1) + values = list(value_space.sample()) + log_prob_beta = np.log( + beta.pdf(values[2]["a"], inputs[2]["a"][:, :2], inputs[2]["a"][:, 2:]) + ) + # Now do the up-scaling for [2] (beta values) to be between + # low/high. + values[2]["a"] = values[2]["a"] * (high - low) + low + inputs[1][:, 3:] = np.exp(inputs[1][:, 3:]) + expected_log_llh = np.sum( + np.concatenate( + [ + np.expand_dims( + np.log([i[values[0][j]] for j, i in enumerate(inputs[0])]), + -1, + ), + np.log(norm.pdf(values[1], inputs[1][:, :3], inputs[1][:, 3:])), + log_prob_beta, + ], -1, - ) + ), + -1, + ) - values[0] = np.expand_dims(values[0], -1) - if fw == "torch": - values = tree.map_structure(lambda s: torch.Tensor(s), values) - # Test all flattened input. - concat = np.concatenate(tree.flatten(values), -1).astype(np.float32) - out = distr.logp(concat) - if sess: - out = sess.run(out) - check(out, expected_log_llh, atol=15) - # Test structured input. - out = distr.logp(values) - if sess: - out = sess.run(out) - check(out, expected_log_llh, atol=15) - # Test flattened input. - out = distr.logp(tree.flatten(values)) - if sess: - out = sess.run(out) - check(out, expected_log_llh, atol=15) + values[0] = np.expand_dims(values[0], -1) + values = tree.map_structure(lambda s: torch.Tensor(s), values) + # Test all flattened input. + concat = np.concatenate(tree.flatten(values), -1).astype(np.float32) + out = distr.logp(concat) + check(out, expected_log_llh, atol=15) + # Test structured input. + out = distr.logp(values) + check(out, expected_log_llh, atol=15) + # Test flattened input. 
+ out = distr.logp(tree.flatten(values)) + check(out, expected_log_llh, atol=15) if __name__ == "__main__": diff --git a/rllib/models/tests/test_attention_nets.py b/rllib/models/tests/test_attention_nets.py index a12ef8bf05fa..1ccc216aec3c 100644 --- a/rllib/models/tests/test_attention_nets.py +++ b/rllib/models/tests/test_attention_nets.py @@ -13,7 +13,6 @@ EPISODE_RETURN_MEAN, NUM_ENV_STEPS_SAMPLED_LIFETIME, ) -from ray.rllib.utils.test_utils import framework_iterator class TestAttentionNets(unittest.TestCase): @@ -75,12 +74,11 @@ def test_attention_nets_w_prev_actions_and_prev_rewards(self): "rollout_fragment_length": 100, "num_env_runners": 1, } - for _ in framework_iterator(config): - tune.Tuner( - "PPO", - param_space=config, - run_config=air.RunConfig(stop={TRAINING_ITERATION: 1}, verbose=1), - ).fit() + tune.Tuner( + "PPO", + param_space=config, + run_config=air.RunConfig(stop={TRAINING_ITERATION: 1}, verbose=1), + ).fit() def test_ppo_attention_net_learning(self): ModelCatalog.register_custom_model("attention_net", GTrXLNet) diff --git a/rllib/models/tests/test_conv2d_default_stacks.py b/rllib/models/tests/test_conv2d_default_stacks.py index 890ec45c6c0f..4cbafb7adbd5 100644 --- a/rllib/models/tests/test_conv2d_default_stacks.py +++ b/rllib/models/tests/test_conv2d_default_stacks.py @@ -4,11 +4,9 @@ from ray.rllib.models.catalog import ModelCatalog, MODEL_DEFAULTS from ray.rllib.models.tf.visionnet import VisionNetwork from ray.rllib.models.torch.visionnet import VisionNetwork as TorchVision -from ray.rllib.utils.framework import try_import_torch, try_import_tf -from ray.rllib.utils.test_utils import framework_iterator +from ray.rllib.utils.framework import try_import_torch torch, nn = try_import_torch() -tf1, tf, tfv = try_import_tf() class TestConv2DDefaultStacks(unittest.TestCase): @@ -27,20 +25,14 @@ def test_conv2d_default_stacks(self): for shape in shapes: print(f"shape={shape}") obs_space = gym.spaces.Box(-1.0, 1.0, shape=shape) - for fw in framework_iterator(): - model = ModelCatalog.get_model_v2( - obs_space, action_space, 2, MODEL_DEFAULTS.copy(), framework=fw - ) - self.assertTrue(isinstance(model, (VisionNetwork, TorchVision))) - if fw == "torch": - output, _ = model( - {"obs": torch.from_numpy(obs_space.sample()[None])} - ) - else: - output, _ = model({"obs": obs_space.sample()[None]}) - # B x [action logits] - self.assertTrue(output.shape == (1, 2)) - print("ok") + model = ModelCatalog.get_model_v2( + obs_space, action_space, 2, MODEL_DEFAULTS.copy(), framework="torch" + ) + self.assertTrue(isinstance(model, (VisionNetwork, TorchVision))) + output, _ = model({"obs": torch.from_numpy(obs_space.sample()[None])}) + # B x [action logits] + self.assertTrue(output.shape == (1, 2)) + print("ok") if __name__ == "__main__": diff --git a/rllib/models/tests/test_lstms.py b/rllib/models/tests/test_lstms.py index 53351f4ab506..c8d204b395e5 100644 --- a/rllib/models/tests/test_lstms.py +++ b/rllib/models/tests/test_lstms.py @@ -6,7 +6,6 @@ from ray.air.constants import TRAINING_ITERATION from ray.rllib.algorithms import ppo from ray.rllib.examples.envs.classes.random_env import RandomEnv -from ray.rllib.utils.test_utils import framework_iterator class TestLSTMs(unittest.TestCase): @@ -63,12 +62,11 @@ def test_lstm_w_prev_action_and_prev_reward(self): ) ) - for _ in framework_iterator(config): - tune.Tuner( - "PPO", - param_space=config.to_dict(), - run_config=air.RunConfig(stop={TRAINING_ITERATION: 1}, verbose=1), - ).fit() + tune.Tuner( + "PPO", + param_space=config.to_dict(), + 
run_config=air.RunConfig(stop={TRAINING_ITERATION: 1}, verbose=1), + ).fit() if __name__ == "__main__": diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index 05d736945ed7..64b0836caec6 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -20,7 +20,6 @@ check, check_compute_single_action, check_train_results, - framework_iterator, ) from ray.rllib.utils.framework import try_import_tf @@ -60,12 +59,11 @@ def test_rlms_and_preprocessing(self): .experimental(_disable_preprocessor_api=True) ) - for _ in framework_iterator(config, frameworks=("torch", "tf2")): - algo = config.build() - results = algo.train() - check_train_results(results) - check_compute_single_action(algo) - algo.stop() + algo = config.build() + results = algo.train() + check_train_results(results) + check_compute_single_action(algo) + algo.stop() def test_preprocessing_disabled_modelv2(self): config = ( @@ -107,15 +105,13 @@ def test_preprocessing_disabled_modelv2(self): # input space. num_iterations = 1 - # Only supported for tf so far. - for _ in framework_iterator(config): - algo = config.build() - for i in range(num_iterations): - results = algo.train() - check_train_results(results) - print(results) - check_compute_single_action(algo) - algo.stop() + algo = config.build() + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + check_compute_single_action(algo) + algo.stop() def test_gym_preprocessors(self): p1 = ModelCatalog.get_preprocessor(gym.make("CartPole-v1")) diff --git a/rllib/policy/tests/test_compute_log_likelihoods.py b/rllib/policy/tests/test_compute_log_likelihoods.py index c13d0bbfd561..9a60bab41603 100644 --- a/rllib/policy/tests/test_compute_log_likelihoods.py +++ b/rllib/policy/tests/test_compute_log_likelihoods.py @@ -7,18 +7,14 @@ import ray.rllib.algorithms.dqn as dqn import ray.rllib.algorithms.ppo as ppo import ray.rllib.algorithms.sac as sac -from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.numpy import MAX_LOG_NN_OUTPUT, MIN_LOG_NN_OUTPUT, fc, one_hot -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import check -tf1, tf, tfv = try_import_tf() - -def _get_expected_logp(fw, vars, obs_batch, a, layer_key, logp_func=None): +def _get_expected_logp(vars, obs_batch, a, layer_key, logp_func=None): """Get the expected logp for the given obs_batch and action. Args: - fw: Framework ("tf" or "torch"). vars: The ModelV2 weights. obs_batch: The observation batch. a: The action batch. @@ -28,29 +24,15 @@ def _get_expected_logp(fw, vars, obs_batch, a, layer_key, logp_func=None): Returns: The expected logp. 
""" - if fw != "torch": - if isinstance(vars, list): - expected_mean_logstd = fc( - fc(obs_batch, vars[layer_key[1][0]]), vars[layer_key[1][1]] - ) - else: - expected_mean_logstd = fc( - fc( - obs_batch, - vars["default_policy/{}_1/kernel".format(layer_key[0])], - ), - vars["default_policy/{}_out/kernel".format(layer_key[0])], - ) - else: - expected_mean_logstd = fc( - fc( - obs_batch, - vars["{}_model.0.weight".format(layer_key[2][0])], - framework=fw, - ), - vars["{}_model.0.weight".format(layer_key[2][1])], - framework=fw, - ) + expected_mean_logstd = fc( + fc( + obs_batch, + vars["{}_model.0.weight".format(layer_key[2][0])], + framework="torch", + ), + vars["{}_model.0.weight".format(layer_key[2][1])], + framework="torch", + ) mean, log_std = np.split(expected_mean_logstd, 2, axis=-1) if logp_func is None: expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std))) @@ -84,66 +66,64 @@ def do_test_log_likelihood( prev_r = None if prev_a is None else np.array(0.0) - # Test against all frameworks. - for fw in framework_iterator(config): - algo = config.build() - - policy = algo.get_policy() - vars = policy.get_weights() - # Sample n actions, then roughly check their logp against their - # counts. - num_actions = 1000 if not continuous else 50 - actions = [] - for _ in range(num_actions): - # Single action from single obs. - actions.append( - algo.compute_single_action( - obs_batch[0], - prev_action=prev_a, - prev_reward=prev_r, - explore=True, - # Do not unsquash actions - # (remain in normalized [-1.0; 1.0] space). - unsquash_action=False, - ) + algo = config.build() + + policy = algo.get_policy() + vars = policy.get_weights() + # Sample n actions, then roughly check their logp against their + # counts. + num_actions = 1000 if not continuous else 50 + actions = [] + for _ in range(num_actions): + # Single action from single obs. + actions.append( + algo.compute_single_action( + obs_batch[0], + prev_action=prev_a, + prev_reward=prev_r, + explore=True, + # Do not unsquash actions + # (remain in normalized [-1.0; 1.0] space). + unsquash_action=False, ) + ) - # Test all taken actions for their log-likelihoods vs expected values. - if continuous: - for idx in range(num_actions): - a = actions[idx] - - logp = policy.compute_log_likelihoods( - np.array([a]), - preprocessed_obs_batch, - prev_action_batch=np.array([prev_a]) if prev_a else None, - prev_reward_batch=np.array([prev_r]) if prev_r else None, - actions_normalized=True, - in_training=False, - ) + # Test all taken actions for their log-likelihoods vs expected values. + if continuous: + for idx in range(num_actions): + a = actions[idx] + + logp = policy.compute_log_likelihoods( + np.array([a]), + preprocessed_obs_batch, + prev_action_batch=np.array([prev_a]) if prev_a else None, + prev_reward_batch=np.array([prev_r]) if prev_r else None, + actions_normalized=True, + in_training=False, + ) - # The expected logp computation logic is overfitted to the ModelV2 - # stack and does not generalize to RLModule API. - if not config.enable_rl_module_and_learner: - expected_logp = _get_expected_logp( - fw, vars, obs_batch, a, layer_key, logp_func - ) - check(logp, expected_logp[0], rtol=0.2) - # Test all available actions for their logp values. 
- else: - for a in [0, 1, 2, 3]: - count = actions.count(a) - expected_prob = count / num_actions - logp = policy.compute_log_likelihoods( - np.array([a]), - preprocessed_obs_batch, - prev_action_batch=np.array([prev_a]) if prev_a else None, - prev_reward_batch=np.array([prev_r]) if prev_r else None, - in_training=False, + # The expected logp computation logic is overfitted to the ModelV2 + # stack and does not generalize to RLModule API. + if not config.enable_rl_module_and_learner: + expected_logp = _get_expected_logp( + vars, obs_batch, a, layer_key, logp_func ) + check(logp, expected_logp[0], rtol=0.2) + # Test all available actions for their logp values. + else: + for a in [0, 1, 2, 3]: + count = actions.count(a) + expected_prob = count / num_actions + logp = policy.compute_log_likelihoods( + np.array([a]), + preprocessed_obs_batch, + prev_action_batch=np.array([prev_a]) if prev_a else None, + prev_reward_batch=np.array([prev_r]) if prev_r else None, + in_training=False, + ) - if not config.enable_rl_module_and_learner: - check(np.exp(logp), expected_prob, atol=0.2) + if not config.enable_rl_module_and_learner: + check(np.exp(logp), expected_prob, atol=0.2) class TestComputeLogLikelihood(unittest.TestCase): diff --git a/rllib/policy/tests/test_export_checkpoint_and_model.py b/rllib/policy/tests/test_export_checkpoint_and_model.py index 32eaa654e00f..67f31b37e58c 100644 --- a/rllib/policy/tests/test_export_checkpoint_and_model.py +++ b/rllib/policy/tests/test_export_checkpoint_and_model.py @@ -8,11 +8,9 @@ import ray from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils.framework import try_import_tf, try_import_torch -from ray.rllib.utils.test_utils import framework_iterator +from ray.rllib.utils.framework import try_import_torch from ray.tune.registry import get_trainable_cls -tf1, tf, tfv = try_import_tf() torch, _ = try_import_torch() # Keep a set of all RLlib algos that support the RLModule API. @@ -25,7 +23,6 @@ def export_test( alg_name, framework="tf", multi_agent=False, - tf_expected_to_work=True, ): cls = get_trainable_cls(alg_name) config = cls.get_default_config() @@ -83,20 +80,6 @@ def export_test( assert results[0].shape in [(1, 2), (1, 3), (1, 256)], results[0].shape assert results[1] == [torch.tensor(0)] # dummy - # Only if keras model gets properly saved by the Policy's export_model() method. - # NOTE: This is not the case (yet) for TF Policies like SAC, which use ModelV2s - # that have more than one keras "base_model" properties in them. For example, - # SACTfModel contains `q_net` and `action_model`, both of which have their own - # `base_model`. - elif tf_expected_to_work: - model = tf.saved_model.load(os.path.join(export_dir, "model")) - assert model - results = model(tf.convert_to_tensor(test_obs, dtype=tf.float32)) - assert len(results) == 2 - assert results[0].shape in [(1, 2), (1, 3), (1, 256)], results[0].shape - # TODO (sven): Make non-RNN models NOT return states (empty list). - assert results[1].shape == (1, 1), results[1].shape # dummy state-out - shutil.rmtree(export_dir) print("Exporting policy (`default_policy`) model ", alg_name, export_dir) @@ -124,20 +107,6 @@ def export_test( assert results[0].shape in [(1, 2), (1, 3), (1, 256)], results[0].shape assert results[1] == [torch.tensor(0)] # dummy - # Only if keras model gets properly saved by the Policy's export_model() method. 
- # NOTE: This is not the case (yet) for TF Policies like SAC, which use ModelV2s - # that have more than one keras "base_model" properties in them. For example, - # SACTfModel contains `q_net` and `action_model`, both of which have their own - # `base_model`. - elif tf_expected_to_work: - model = tf.saved_model.load(export_dir) - assert model - results = model(tf.convert_to_tensor(test_obs, dtype=tf.float32)) - assert len(results) == 2 - assert results[0].shape in [(1, 2), (1, 3), (1, 256)], results[0].shape - # TODO (sven): Make non-RNN models NOT return states (empty list). - assert results[1].shape == (1, 1), results[1].shape # dummy state-out - if os.path.exists(export_dir): shutil.rmtree(export_dir) if multi_agent: @@ -156,20 +125,16 @@ def tearDownClass(cls) -> None: ray.shutdown() def test_export_appo(self): - for fw in framework_iterator(): - export_test("APPO", fw) + export_test("APPO", "torch") def test_export_ppo(self): - for fw in framework_iterator(): - export_test("PPO", fw) + export_test("PPO", "torch") def test_export_ppo_multi_agent(self): - for fw in framework_iterator(): - export_test("PPO", fw, multi_agent=True) + export_test("PPO", "torch", multi_agent=True) def test_export_sac(self): - for fw in framework_iterator(): - export_test("SAC", fw, tf_expected_to_work=False) + export_test("SAC", "torch") if __name__ == "__main__": diff --git a/rllib/policy/tests/test_policy.py b/rllib/policy/tests/test_policy.py index 6bd09c6e8ff3..751fc1c3ab03 100644 --- a/rllib/policy/tests/test_policy.py +++ b/rllib/policy/tests/test_policy.py @@ -6,7 +6,7 @@ from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 from ray.rllib.policy.policy import Policy from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import check class TestPolicy(unittest.TestCase): @@ -20,40 +20,38 @@ def tearDownClass(cls) -> None: def test_policy_get_and_set_state(self): config = PPOConfig().environment("CartPole-v1") - for fw in framework_iterator(config): - algo = config.build() - policy = algo.get_policy() - state1 = policy.get_state() - algo.train() - state2 = policy.get_state() - check(state1["global_timestep"], state2["global_timestep"], false=True) - - # Reset policy to its original state and compare. - policy.set_state(state1) - state3 = policy.get_state() - # Make sure everything is the same. + algo = config.build() + policy = algo.get_policy() + state1 = policy.get_state() + algo.train() + state2 = policy.get_state() + check(state1["global_timestep"], state2["global_timestep"], false=True) + + # Reset policy to its original state and compare. + policy.set_state(state1) + state3 = policy.get_state() + # Make sure everything is the same. + # This is only supported without RLModule API. See AlgorithmConfig for + # more info. + if not config.enable_rl_module_and_learner: + check(state1["_exploration_state"], state3["_exploration_state"]) + check(state1["global_timestep"], state3["global_timestep"]) + check(state1["weights"], state3["weights"]) + + # Create a new Policy only from state (which could be part of an algorithm's + # checkpoint). This would allow users to restore a policy w/o having access + # to the original code (e.g. the config, policy class used, etc..). 
+ if isinstance(policy, (EagerTFPolicyV2, DynamicTFPolicyV2, TorchPolicyV2)): + policy_restored_from_scratch = Policy.from_state(state3) + state4 = policy_restored_from_scratch.get_state() # This is only supported without RLModule API. See AlgorithmConfig for # more info. if not config.enable_rl_module_and_learner: - check(state1["_exploration_state"], state3["_exploration_state"]) - check(state1["global_timestep"], state3["global_timestep"]) - check(state1["weights"], state3["weights"]) - - # Create a new Policy only from state (which could be part of an algorithm's - # checkpoint). This would allow users to restore a policy w/o having access - # to the original code (e.g. the config, policy class used, etc..). - if isinstance(policy, (EagerTFPolicyV2, DynamicTFPolicyV2, TorchPolicyV2)): - policy_restored_from_scratch = Policy.from_state(state3) - state4 = policy_restored_from_scratch.get_state() - # This is only supported without RLModule API. See AlgorithmConfig for - # more info. - if not config.enable_rl_module_and_learner: - check(state3["_exploration_state"], state4["_exploration_state"]) - check(state3["global_timestep"], state4["global_timestep"]) - # For tf static graph, the new model has different layer names - # (as it gets written into the same graph as the old one). - if fw != "tf": - check(state3["weights"], state4["weights"]) + check(state3["_exploration_state"], state4["_exploration_state"]) + check(state3["global_timestep"], state4["global_timestep"]) + # For tf static graph, the new model has different layer names + # (as it gets written into the same graph as the old one). + check(state3["weights"], state4["weights"]) if __name__ == "__main__": diff --git a/rllib/policy/tests/test_policy_checkpoint_restore.py b/rllib/policy/tests/test_policy_checkpoint_restore.py index cc7598dc7710..93449c550fd4 100644 --- a/rllib/policy/tests/test_policy_checkpoint_restore.py +++ b/rllib/policy/tests/test_policy_checkpoint_restore.py @@ -10,7 +10,6 @@ from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.policy import Policy -from ray.rllib.utils.test_utils import framework_iterator def _do_checkpoint_twice_test(framework): @@ -20,25 +19,24 @@ def _do_checkpoint_twice_test(framework): .env_runners(num_env_runners=0) .evaluation(evaluation_num_env_runners=0) ) - for fw in framework_iterator(config, frameworks=[framework]): - algo1 = config.build(env="CartPole-v1") - algo2 = config.build(env="Pendulum-v1") + algo1 = config.build(env="CartPole-v1") + algo2 = config.build(env="Pendulum-v1") - algo1.train() - algo2.train() + algo1.train() + algo2.train() - policy1 = algo1.get_policy() - policy1.export_checkpoint("/tmp/test_policy_from_checkpoint_twice_p_1") + policy1 = algo1.get_policy() + policy1.export_checkpoint("/tmp/test_policy_from_checkpoint_twice_p_1") - policy2 = algo2.get_policy() - policy2.export_checkpoint("/tmp/test_policy_from_checkpoint_twice_p_2") + policy2 = algo2.get_policy() + policy2.export_checkpoint("/tmp/test_policy_from_checkpoint_twice_p_2") - algo1.stop() - algo2.stop() + algo1.stop() + algo2.stop() - # Create two policies from different checkpoints - Policy.from_checkpoint("/tmp/test_policy_from_checkpoint_twice_p_1") - Policy.from_checkpoint("/tmp/test_policy_from_checkpoint_twice_p_2") + # Create two policies from different checkpoints + Policy.from_checkpoint("/tmp/test_policy_from_checkpoint_twice_p_1") + Policy.from_checkpoint("/tmp/test_policy_from_checkpoint_twice_p_2") class TestPolicyFromCheckpoint(unittest.TestCase): @@ -50,12 +48,6 @@ def 
setUpClass(cls) -> None: def tearDownClass(cls) -> None: ray.shutdown() - def test_policy_from_checkpoint_twice_tf(self): - return _do_checkpoint_twice_test("tf") - - def test_policy_from_checkpoint_twice_tf2(self): - return _do_checkpoint_twice_test("tf2") - def test_policy_from_checkpoint_twice_torch(self): return _do_checkpoint_twice_test("torch") diff --git a/rllib/policy/tests/test_policy_state_swapping.py b/rllib/policy/tests/test_policy_state_swapping.py index ed1328d6c1b1..ca60bb0a58fd 100644 --- a/rllib/policy/tests/test_policy_state_swapping.py +++ b/rllib/policy/tests/test_policy_state_swapping.py @@ -4,19 +4,11 @@ import unittest import ray -from ray.rllib.algorithms.appo import ( - APPOConfig, - APPOTF1Policy, - APPOTF2Policy, - APPOTorchPolicy, -) +from ray.rllib.algorithms.appo import APPOConfig, APPOTorchPolicy from ray.rllib.policy.policy_map import PolicyMap -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import check from ray.rllib.utils.tf_utils import get_tf_eager_cls_if_necessary -tf1, tf, tfv = try_import_tf() - class TestPolicyStateSwapping(unittest.TestCase): """Tests, whether Policies' states can be swapped out via their state on a GPU.""" @@ -34,9 +26,6 @@ def test_policy_swap_gpu(self): APPOConfig() # Use a single GPU for this test. .resources(num_gpus=1) - # Set eager tracing to True here, such that the framework_iterator loop - # below skips tf2 w/o tracing (loops through tf, tf2+tracing, and torch). - .framework("tf2") ) obs_space = gym.spaces.Box(-1.0, 1.0, (4,), dtype=np.float32) dummy_obs = obs_space.sample() @@ -44,99 +33,88 @@ def test_policy_swap_gpu(self): num_policies = 2 capacity = 1 - for fw in framework_iterator(config): - cls = get_tf_eager_cls_if_necessary( - APPOTF2Policy - if fw == "tf2" - else APPOTF1Policy - if fw == "tf" - else APPOTorchPolicy, - config, - ) + cls = get_tf_eager_cls_if_necessary(APPOTorchPolicy, config) - # Create empty, swappable-policies PolicyMap. - policy_map = PolicyMap(capacity=capacity, policy_states_are_swappable=True) - - # Create and add some TF2 policies. - for i in range(num_policies): - config.training(lr=(i + 1) * 0.01) - with tf1.variable_scope(f"Policy{i}"): - policy = cls( - observation_space=obs_space, - action_space=act_space, - config=config.to_dict(), - ) - policy_map[f"pol{i}"] = policy - - # Create a dummy batch with all 1.0s in it (instead of zeros), so we have a - # better chance of changing our weights during an update. - dummy_batch_ones = tree.map_structure( - lambda s: np.ones_like(s), - policy_map["pol0"]._dummy_batch, + # Create empty, swappable-policies PolicyMap. + policy_map = PolicyMap(capacity=capacity, policy_states_are_swappable=True) + + # Create and add some TF2 policies. + for i in range(num_policies): + config.training(lr=(i + 1) * 0.01) + policy = cls( + observation_space=obs_space, + action_space=act_space, + config=config.to_dict(), ) - dummy_batch_twos = tree.map_structure( - lambda s: np.full_like(s, 2.0), - policy_map["pol0"]._dummy_batch, + policy_map[f"pol{i}"] = policy + + # Create a dummy batch with all 1.0s in it (instead of zeros), so we have a + # better chance of changing our weights during an update. 
+ dummy_batch_ones = tree.map_structure( + lambda s: np.ones_like(s), + policy_map["pol0"]._dummy_batch, + ) + dummy_batch_twos = tree.map_structure( + lambda s: np.full_like(s, 2.0), + policy_map["pol0"]._dummy_batch, + ) + + logits = { + pid: p.compute_single_action(dummy_obs)[2]["action_dist_inputs"] + for pid, p in policy_map.items() + } + # Make sure policies output different deterministic actions. Otherwise, + # this test would not work. + check(logits["pol0"], logits["pol1"], atol=0.0000001, false=True) + + # Test proper policy state swapping. + for i in range(50): + pid = f"pol{i % num_policies}" + print(i) + pol = policy_map[pid] + # Make sure config has been changed properly. + self.assertTrue(pol.config["lr"] == ((i % num_policies) + 1) * 0.01) + # After accessing `pid`, assume it's the most recently accessed + # item now. + self.assertTrue(policy_map._deque[-1] == pid) + self.assertTrue(len(policy_map._deque) == capacity) + self.assertTrue(len(policy_map.cache) == capacity) + self.assertTrue(pid in policy_map.cache) + # Actually compute one action to trigger tracing operations of + # the graph. These may be performed lazily by the DL framework. + check( + pol.compute_single_action(dummy_obs)[2]["action_dist_inputs"], + logits[pid], ) - logits = { - pid: p.compute_single_action(dummy_obs)[2]["action_dist_inputs"] - for pid, p in policy_map.items() - } - # Make sure policies output different deterministic actions. Otherwise, - # this test would not work. - check(logits["pol0"], logits["pol1"], atol=0.0000001, false=True) - - # Test proper policy state swapping. - for i in range(50): - pid = f"pol{i % num_policies}" - print(i) - pol = policy_map[pid] - # Make sure config has been changed properly. - self.assertTrue(pol.config["lr"] == ((i % num_policies) + 1) * 0.01) - # After accessing `pid`, assume it's the most recently accessed - # item now. - self.assertTrue(policy_map._deque[-1] == pid) - self.assertTrue(len(policy_map._deque) == capacity) - self.assertTrue(len(policy_map.cache) == capacity) - self.assertTrue(pid in policy_map.cache) - # Actually compute one action to trigger tracing operations of - # the graph. These may be performed lazily by the DL framework. - check( - pol.compute_single_action(dummy_obs)[2]["action_dist_inputs"], - logits[pid], - ) - - # Test, whether training (on the GPU) will affect the state swapping. - for i in range(num_policies): - pid = f"pol{i % num_policies}" - pol = policy_map[pid] - if i == 0: - pol.learn_on_batch(dummy_batch_ones) - else: - assert i == 1 - pol.learn_on_batch(dummy_batch_twos) - - # Make sure, we really changed the NN during training and update our - # actions dict. - old_logits = logits[pid] - logits[pid] = pol.compute_single_action(dummy_obs)[2][ - "action_dist_inputs" - ] - check(logits[pid], old_logits, atol=0.0000001, false=True) - - # Make sure policies output different deterministic actions. Otherwise, - # this test would not work. - check(logits["pol0"], logits["pol1"], atol=0.0000001, false=True) - - # Once more, test proper policy state swapping. - for i in range(50): - pid = f"pol{i % num_policies}" - pol = policy_map[pid] - check( - pol.compute_single_action(dummy_obs)[2]["action_dist_inputs"], - logits[pid], - ) + # Test, whether training (on the GPU) will affect the state swapping. 
+ for i in range(num_policies): + pid = f"pol{i % num_policies}" + pol = policy_map[pid] + if i == 0: + pol.learn_on_batch(dummy_batch_ones) + else: + assert i == 1 + pol.learn_on_batch(dummy_batch_twos) + + # Make sure, we really changed the NN during training and update our + # actions dict. + old_logits = logits[pid] + logits[pid] = pol.compute_single_action(dummy_obs)[2]["action_dist_inputs"] + check(logits[pid], old_logits, atol=0.0000001, false=True) + + # Make sure policies output different deterministic actions. Otherwise, + # this test would not work. + check(logits["pol0"], logits["pol1"], atol=0.0000001, false=True) + + # Once more, test proper policy state swapping. + for i in range(50): + pid = f"pol{i % num_policies}" + pol = policy_map[pid] + check( + pol.compute_single_action(dummy_obs)[2]["action_dist_inputs"], + logits[pid], + ) if __name__ == "__main__": diff --git a/rllib/tests/backward_compat/checkpoints/create_checkpoints.py b/rllib/tests/backward_compat/checkpoints/create_checkpoints.py index 6a8e9ade5fb3..952d299d385f 100644 --- a/rllib/tests/backward_compat/checkpoints/create_checkpoints.py +++ b/rllib/tests/backward_compat/checkpoints/create_checkpoints.py @@ -3,7 +3,6 @@ # Checkpoints will be located in ~/ray_results/... from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.utils.test_utils import framework_iterator # Build a PPOConfig object. config = ( @@ -17,8 +16,7 @@ ) ) -for fw in framework_iterator(config): - algo = config.build() - results = algo.train() - algo.save() - algo.stop() +algo = config.build() +results = algo.train() +algo.save() +algo.stop() diff --git a/rllib/tests/backward_compat/test_backward_compat.py b/rllib/tests/backward_compat/test_backward_compat.py index e29daf2a29b1..1368ab85aa30 100644 --- a/rllib/tests/backward_compat/test_backward_compat.py +++ b/rllib/tests/backward_compat/test_backward_compat.py @@ -1,19 +1,11 @@ -import os -from pathlib import Path -from packaging import version import sys import unittest import ray -import ray.cloudpickle as pickle -from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig from ray.rllib.algorithms.dqn import DQN -from ray.rllib.algorithms.ppo import PPO from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole -from ray.rllib.policy.policy import Policy, PolicySpec -from ray.rllib.utils.checkpoints import get_checkpoint_info -from ray.rllib.utils.test_utils import framework_iterator +from ray.rllib.policy.policy import PolicySpec from ray.tune.registry import register_env @@ -26,58 +18,6 @@ def setUpClass(cls): def tearDownClass(cls): ray.shutdown() - def test_old_checkpoint_formats(self): - """Tests, whether we remain backward compatible (>=2.0.0) wrt checkpoints.""" - - rllib_dir = Path(__file__).parent.parent.parent - print(f"rllib dir={rllib_dir} exists={os.path.isdir(rllib_dir)}") - - # TODO: Once checkpoints are python version independent (once we stop using - # pickle), add 1.0 here as well. - # Broken due to gymnasium move (old gym envs not recoverable via pickle due to - # gym version conflict (gym==0.23.x not compatible with gym==0.26.x)). 
- for v in []: # "0.1" - v = version.Version(v) - for fw in framework_iterator(): - path_to_checkpoint = os.path.join( - rllib_dir, - "tests", - "backward_compat", - "checkpoints", - "v" + str(v), - "ppo_frozenlake_" + fw, - ) - - print( - f"path_to_checkpoint={path_to_checkpoint} " - f"exists={os.path.isdir(path_to_checkpoint)}" - ) - - checkpoint_info = get_checkpoint_info(path_to_checkpoint) - # v0.1: Need to create algo first, then restore. - if checkpoint_info["checkpoint_version"] == version.Version("0.1"): - # For checkpoints <= v0.1, we need to magically know the original - # config used as well as the algo class. - with open(checkpoint_info["state_file"], "rb") as f: - state = pickle.load(f) - worker_state = pickle.loads(state["worker"]) - algo = PPO(config=worker_state["policy_config"]) - # Note, we can not use restore() here because the testing - # checkpoints are created with Algorithm.save() by - # checkpoints/create_checkpoints.py. I.e, they are missing - # all the Tune checkpoint metadata. - algo.load_checkpoint(path_to_checkpoint) - # > v0.1: Simply use new `Algorithm.from_checkpoint()` staticmethod. - else: - algo = Algorithm.from_checkpoint(path_to_checkpoint) - - # Also test restoring a Policy from an algo checkpoint. - policies = Policy.from_checkpoint(path_to_checkpoint) - self.assertTrue("default_policy" in policies) - - print(algo.train()) - algo.stop() - def test_old_algorithm_config_dicts(self): """Tests, whether we can build Algorithm objects with old config dicts.""" diff --git a/rllib/tests/test_gpus.py b/rllib/tests/test_gpus.py index 4bbd769c5b8f..54ef39821f23 100644 --- a/rllib/tests/test_gpus.py +++ b/rllib/tests/test_gpus.py @@ -5,7 +5,6 @@ from ray.air.constants import TRAINING_ITERATION from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.test_utils import framework_iterator from ray import tune torch, _ = try_import_torch() @@ -43,44 +42,38 @@ def test_gpus_in_non_local_mode(self): f"_fake_gpus={fake_gpus}" ) - frameworks = ( - ("tf", "torch") if num_gpus > 1 else ("tf2", "tf", "torch") - ) - for _ in framework_iterator(config, frameworks=frameworks): - # Expect that Algorithm creation causes a num_gpu error. - if ( - actual_gpus < num_gpus + 2 * num_gpus_per_env_runner - and not fake_gpus - ): - # "Direct" RLlib (create Algorithm on the driver). - # Cannot run through ray.tune.Tuner().fit() as it would - # simply wait infinitely for the resources to - # become available. - print("direct RLlib") - self.assertRaisesRegex( - RuntimeError, - "Found 0 GPUs on your machine", - lambda: config.build(), - ) - # If actual_gpus >= num_gpus or faked, - # expect no error. - else: - print("direct RLlib") - algo = config.build() - algo.stop() - # Cannot run through ray.tune.Tuner().fit() w/ fake GPUs - # as it would simply wait infinitely for the - # resources to become available (even though, we - # wouldn't really need them). - if num_gpus == 0: - print("via ray.tune.Tuner().fit()") - tune.Tuner( - "PPO", - param_space=config, - run_config=air.RunConfig( - stop={TRAINING_ITERATION: 0} - ), - ).fit() + # Expect that Algorithm creation causes a num_gpu error. + if ( + actual_gpus < num_gpus + 2 * num_gpus_per_env_runner + and not fake_gpus + ): + # "Direct" RLlib (create Algorithm on the driver). + # Cannot run through ray.tune.Tuner().fit() as it would + # simply wait infinitely for the resources to + # become available. 
+ print("direct RLlib") + self.assertRaisesRegex( + RuntimeError, + "Found 0 GPUs on your machine", + lambda: config.build(), + ) + # If actual_gpus >= num_gpus or faked, + # expect no error. + else: + print("direct RLlib") + algo = config.build() + algo.stop() + # Cannot run through ray.tune.Tuner().fit() w/ fake GPUs + # as it would simply wait infinitely for the + # resources to become available (even though, we + # wouldn't really need them). + if num_gpus == 0: + print("via ray.tune.Tuner().fit()") + tune.Tuner( + "PPO", + param_space=config, + run_config=air.RunConfig(stop={TRAINING_ITERATION: 0}), + ).fit() ray.shutdown() def test_gpus_in_local_mode(self): @@ -97,17 +90,15 @@ def test_gpus_in_local_mode(self): for fake_gpus in [False, True]: print(f"_fake_gpus={fake_gpus}") config.resources(num_gpus=num_gpus, _fake_gpus=fake_gpus) - frameworks = ("tf", "torch") if num_gpus > 1 else ("tf2", "tf", "torch") - for _ in framework_iterator(config, frameworks=frameworks): - print("direct RLlib") - algo = config.build() - algo.stop() - print("via ray.tune.Tuner().fit()") - tune.Tuner( - "PPO", - param_space=config, - run_config=air.RunConfig(stop={TRAINING_ITERATION: 0}), - ).fit() + print("direct RLlib") + algo = config.build() + algo.stop() + print("via ray.tune.Tuner().fit()") + tune.Tuner( + "PPO", + param_space=config, + run_config=air.RunConfig(stop={TRAINING_ITERATION: 0}), + ).fit() ray.shutdown() diff --git a/rllib/tests/test_io.py b/rllib/tests/test_io.py index 027e1382cc26..0fe968a2ae61 100644 --- a/rllib/tests/test_io.py +++ b/rllib/tests/test_io.py @@ -31,7 +31,6 @@ EVALUATION_RESULTS, NUM_ENV_STEPS_SAMPLED_LIFETIME, ) -from ray.rllib.utils.test_utils import framework_iterator SAMPLES = SampleBatch( { @@ -71,33 +70,30 @@ def write_outputs(self, output, fw, output_config=None): return algo def test_agent_output_ok(self): - for fw in framework_iterator(frameworks=("torch", "tf")): - self.write_outputs(self.test_dir, fw) - # PPO has two workers, so we expect 2 output files. - self.assertEqual(len(os.listdir(self.test_dir + fw)), 2) - reader = JsonReader(self.test_dir + fw + "/*.json") - reader.next() + self.write_outputs(self.test_dir, "torch") + # PPO has two workers, so we expect 2 output files. + self.assertEqual(len(os.listdir(self.test_dir + "torch")), 2) + reader = JsonReader(self.test_dir + "torch" + "/*.json") + reader.next() def test_agent_output_logdir(self): """Test special value 'logdir' as Agent's output.""" - for fw in framework_iterator(): - agent = self.write_outputs("logdir", fw) - # PPO has two workers, so we expect 2 output files. - self.assertEqual(len(glob.glob(agent.logdir + "/output-*.json")), 2) + agent = self.write_outputs("logdir", "torch") + # PPO has two workers, so we expect 2 output files. + self.assertEqual(len(glob.glob(agent.logdir + "/output-*.json")), 2) def test_agent_output_infos(self): """Verify that the infos dictionary is written to the output files. Note, with torch this is always the case.""" output_config = {"store_infos": True} - for fw in framework_iterator(frameworks=("torch", "tf")): - self.write_outputs(self.test_dir, fw, output_config=output_config) - # PPO has two workers, so we expect 2 output files. 
- self.assertEqual(len(os.listdir(self.test_dir + fw)), 2) - reader = JsonReader(self.test_dir + fw + "/*.json") - data = reader.next() - data = convert_ma_batch_to_sample_batch(data) - self.assertTrue("infos" in data) + self.write_outputs(self.test_dir, "torch", output_config=output_config) + # PPO has two workers, so we expect 2 output files. + self.assertEqual(len(os.listdir(self.test_dir + "torch")), 2) + reader = JsonReader(self.test_dir + "torch" + "/*.json") + data = reader.next() + data = convert_ma_batch_to_sample_batch(data) + self.assertTrue("infos" in data) def test_agent_input_dir(self): config = ( @@ -107,18 +103,17 @@ def test_agent_input_dir(self): .training(train_batch_size=250) ) - for fw in framework_iterator(config, frameworks=("torch", "tf")): - self.write_outputs(self.test_dir, fw) - config.offline_data( - input_=self.test_dir + fw, - ) - print("WROTE TO: ", self.test_dir) - algo = config.build() - result = algo.train() - self.assertEqual( - result[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"], 250 - ) # read from input - self.assertTrue(np.isnan(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN])) + self.write_outputs(self.test_dir, "torch") + config.offline_data( + input_=self.test_dir + "torch", + ) + print("WROTE TO: ", self.test_dir) + algo = config.build() + result = algo.train() + self.assertEqual( + result[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"], 250 + ) # read from input + self.assertTrue(np.isnan(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN])) def test_split_by_episode(self): splits = SAMPLES.split_by_episode() @@ -138,39 +133,38 @@ def test_agent_input_postprocessing_enabled(self): .evaluation(off_policy_estimation_methods={}) ) - for fw in framework_iterator(config, frameworks=("tf", "torch")): - self.write_outputs(self.test_dir, fw) - config.offline_data(input_=self.test_dir + fw) - - # Rewrite the files to drop advantages and value_targets for - # testing - for path in glob.glob(self.test_dir + fw + "/*.json"): - out = [] - with open(path) as f: - for line in f.readlines(): - data_string = json.loads(line) - data = from_json_data(data_string, None) - data = convert_ma_batch_to_sample_batch(data) - # Data won't contain rewards as these are not included - # in the write_outputs run (not needed in the - # SampleBatch). Flip out "rewards" for "advantages" - # just for testing. - data["rewards"] = data["advantages"] - del data["advantages"] - if "value_targets" in data: - del data["value_targets"] - out.append(_to_json_dict(data, [])) - with open(path, "w") as f: - for data in out: - f.write(json.dumps(data)) + self.write_outputs(self.test_dir, "torch") + config.offline_data(input_=self.test_dir + "torch") + + # Rewrite the files to drop advantages and value_targets for + # testing + for path in glob.glob(self.test_dir + "torch" + "/*.json"): + out = [] + with open(path) as f: + for line in f.readlines(): + data_string = json.loads(line) + data = from_json_data(data_string, None) + data = convert_ma_batch_to_sample_batch(data) + # Data won't contain rewards as these are not included + # in the write_outputs run (not needed in the + # SampleBatch). Flip out "rewards" for "advantages" + # just for testing. 
+ data["rewards"] = data["advantages"] + del data["advantages"] + if "value_targets" in data: + del data["value_targets"] + out.append(_to_json_dict(data, [])) + with open(path, "w") as f: + for data in out: + f.write(json.dumps(data)) - algo = config.build() - result = algo.train() - self.assertEqual( - result[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"], 250 - ) # read from input - self.assertTrue(np.isnan(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN])) - algo.stop() + algo = config.build() + result = algo.train() + self.assertEqual( + result[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"], 250 + ) # read from input + self.assertTrue(np.isnan(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN])) + algo.stop() def test_agent_input_eval_sampler(self): config = ( @@ -185,18 +179,17 @@ def test_agent_input_eval_sampler(self): ) ) - for fw in framework_iterator(config, frameworks=["tf", "torch"]): - self.write_outputs(self.test_dir, fw) - config.offline_data(input_=self.test_dir + fw) - algo = config.build() - result = algo.train() - assert np.isnan( - result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] - ), "episode reward should not be computed for offline data" - assert not np.isnan( - result[EVALUATION_RESULTS][ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] - ), "Did not see simulation results during evaluation" - algo.stop() + self.write_outputs(self.test_dir, "torch") + config.offline_data(input_=self.test_dir + "torch") + algo = config.build() + result = algo.train() + assert np.isnan( + result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] + ), "episode reward should not be computed for offline data" + assert not np.isnan( + result[EVALUATION_RESULTS][ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] + ), "Did not see simulation results during evaluation" + algo.stop() def test_agent_input_list(self): config = ( @@ -206,33 +199,29 @@ def test_agent_input_list(self): .evaluation(off_policy_estimation_methods={}) ) - for fw in framework_iterator(config, frameworks=("torch", "tf")): - self.write_outputs(self.test_dir, fw) - config.offline_data(input_=glob.glob(self.test_dir + fw + "/*.json")) - algo = config.build() - result = algo.train() - self.assertEqual( - result[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"], 250 - ) # read from input - self.assertTrue(np.isnan(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN])) - algo.stop() + self.write_outputs(self.test_dir, "torch") + config.offline_data(input_=glob.glob(self.test_dir + "torch" + "/*.json")) + algo = config.build() + result = algo.train() + self.assertEqual( + result[f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}"], 250 + ) # read from input + self.assertTrue(np.isnan(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN])) + algo.stop() def test_agent_input_dict(self): config = PPOConfig().environment("CartPole-v1").training(train_batch_size=2000) - for fw in framework_iterator(config): - self.write_outputs(self.test_dir, fw) - config.offline_data( - input_={ - self.test_dir + fw: 0.1, - "sampler": 0.9, - } - ) - algo = config.build() - result = algo.train() - self.assertTrue( - not np.isnan(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]) - ) - algo.stop() + self.write_outputs(self.test_dir, "torch") + config.offline_data( + input_={ + self.test_dir + "torch": 0.1, + "sampler": 0.9, + } + ) + algo = config.build() + result = algo.train() + self.assertTrue(not np.isnan(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN])) + algo.stop() def test_custom_input_procedure(self): class CustomJsonReader(JsonReader): @@ -258,16 +247,13 @@ def input_creator(ioctx: IOContext) -> InputReader: 
.evaluation(off_policy_estimation_methods={}) ) - for fw in framework_iterator(config, frameworks=("torch", "tf")): - self.write_outputs(self.test_dir, fw) - config.offline_data(input_config={"input_files": self.test_dir + fw}) - algo = config.build() - result = algo.train() - self.assertEqual(result[NUM_ENV_STEPS_SAMPLED_LIFETIME], 4000) - self.assertTrue( - np.isnan(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN]) - ) - algo.stop() + self.write_outputs(self.test_dir, "torch") + config.offline_data(input_config={"input_files": self.test_dir + "torch"}) + algo = config.build() + result = algo.train() + self.assertEqual(result[NUM_ENV_STEPS_SAMPLED_LIFETIME], 4000) + self.assertTrue(np.isnan(result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN])) + algo.stop() def test_multiple_output_workers(self): ray.shutdown() @@ -281,14 +267,13 @@ def test_multiple_output_workers(self): .evaluation(off_policy_estimation_methods={}) ) - for fw in framework_iterator(config, frameworks=["tf", "torch"]): - config.offline_data(output=self.test_dir + fw) - algo = config.build() - algo.train() - self.assertEqual(len(os.listdir(self.test_dir + fw)), 2) - reader = JsonReader(self.test_dir + fw + "/*.json") - reader.next() - algo.stop() + config.offline_data(output=self.test_dir + "torch") + algo = config.build() + algo.train() + self.assertEqual(len(os.listdir(self.test_dir + "torch")), 2) + reader = JsonReader(self.test_dir + "torch" + "/*.json") + reader.next() + algo.stop() class JsonIOTest(unittest.TestCase): diff --git a/rllib/tests/test_local.py b/rllib/tests/test_local.py index 7664a8158cff..38f87ff099f4 100644 --- a/rllib/tests/test_local.py +++ b/rllib/tests/test_local.py @@ -2,7 +2,6 @@ import ray from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.utils.test_utils import framework_iterator class LocalModeTest(unittest.TestCase): @@ -20,10 +19,9 @@ def test_local(self): .training(model={"fcnet_hiddens": [10]}) ) - for _ in framework_iterator(config): - algo = config.build() - print(algo.train()) - algo.stop() + algo = config.build() + print(algo.train()) + algo.stop() if __name__ == "__main__": diff --git a/rllib/tests/test_nn_framework_import_errors.py b/rllib/tests/test_nn_framework_import_errors.py index 0ab0f5fb8f8e..61c06816d09d 100644 --- a/rllib/tests/test_nn_framework_import_errors.py +++ b/rllib/tests/test_nn_framework_import_errors.py @@ -3,18 +3,6 @@ import pytest import ray.rllib.algorithms.ppo as ppo -from ray.rllib.utils.test_utils import framework_iterator - - -def test_dont_import_tf_error(): - """Check error being thrown, if tf not installed but configured.""" - # Do not import tf for testing purposes. 
- os.environ["RLLIB_TEST_NO_TF_IMPORT"] = "1" - - config = ppo.PPOConfig().environment("CartPole-v1") - for _ in framework_iterator(config, frameworks=("tf", "tf2")): - with pytest.raises(ImportError, match="However, no installation was found"): - config.build() def test_dont_import_torch_error(): @@ -27,5 +15,4 @@ def test_dont_import_torch_error(): if __name__ == "__main__": - test_dont_import_tf_error() test_dont_import_torch_error() diff --git a/rllib/tests/test_reproducibility.py b/rllib/tests/test_reproducibility.py index 1cd89d5ecc1f..682fd1984ef5 100644 --- a/rllib/tests/test_reproducibility.py +++ b/rllib/tests/test_reproducibility.py @@ -9,7 +9,6 @@ EPISODE_RETURN_MIN, ENV_RUNNER_RESULTS, ) -from ray.rllib.utils.test_utils import framework_iterator from ray.tune.registry import register_env @@ -33,48 +32,46 @@ def step(self, action): def env_creator(env_config): return PickLargest() - for fw in framework_iterator(frameworks=("tf", "torch")): - trajs = list() - for trial in range(3): - ray.init() - register_env("PickLargest", env_creator) - config = ( - DQNConfig() - .environment("PickLargest") - .debugging(seed=666 if trial in [0, 1] else 999) - .reporting( - min_time_s_per_iteration=0, - min_sample_timesteps_per_iteration=100, - ) - .framework(fw) + trajs = [] + for trial in range(3): + ray.init() + register_env("PickLargest", env_creator) + config = ( + DQNConfig() + .environment("PickLargest") + .debugging(seed=666 if trial in [0, 1] else 999) + .reporting( + min_time_s_per_iteration=0, + min_sample_timesteps_per_iteration=100, ) - algo = config.build() + ) + algo = config.build() - trajectory = list() - for _ in range(8): - r = algo.train() - trajectory.append(r[ENV_RUNNER_RESULTS][EPISODE_RETURN_MAX]) - trajectory.append(r[ENV_RUNNER_RESULTS][EPISODE_RETURN_MIN]) - trajs.append(trajectory) + trajectory = list() + for _ in range(8): + r = algo.train() + trajectory.append(r[ENV_RUNNER_RESULTS][EPISODE_RETURN_MAX]) + trajectory.append(r[ENV_RUNNER_RESULTS][EPISODE_RETURN_MIN]) + trajs.append(trajectory) - algo.stop() - ray.shutdown() + algo.stop() + ray.shutdown() - # trial0 and trial1 use same seed and thus - # expect identical trajectories. - all_same = True - for v0, v1 in zip(trajs[0], trajs[1]): - if v0 != v1: - all_same = False - self.assertTrue(all_same) + # trial0 and trial1 use same seed and thus + # expect identical trajectories. + all_same = True + for v0, v1 in zip(trajs[0], trajs[1]): + if v0 != v1: + all_same = False + self.assertTrue(all_same) - # trial1 and trial2 use different seeds and thus - # most rewards tend to be different. - diff_cnt = 0 - for v1, v2 in zip(trajs[1], trajs[2]): - if v1 != v2: - diff_cnt += 1 - self.assertTrue(diff_cnt > 8) + # trial1 and trial2 use different seeds and thus + # most rewards tend to be different. 
+ diff_cnt = 0 + for v1, v2 in zip(trajs[1], trajs[2]): + if v1 != v2: + diff_cnt += 1 + self.assertTrue(diff_cnt > 8) if __name__ == "__main__": diff --git a/rllib/tests/test_supported_multi_agent.py b/rllib/tests/test_supported_multi_agent.py index edd26e5443bf..469dba2ea790 100644 --- a/rllib/tests/test_supported_multi_agent.py +++ b/rllib/tests/test_supported_multi_agent.py @@ -11,7 +11,7 @@ MultiAgentMountainCar, ) from ray.rllib.policy.policy import PolicySpec -from ray.rllib.utils.test_utils import check_train_results, framework_iterator +from ray.rllib.utils.test_utils import check_train_results from ray.tune.registry import register_env @@ -36,18 +36,15 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): config.multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn) - for fw in framework_iterator(config): - if fw == "tf2" and alg == "IMPALA": - continue - if alg == "SAC": - a = config.build(env="multi_agent_mountaincar") - else: - a = config.build(env="multi_agent_cartpole") - - results = a.train() - check_train_results(results) - print(results) - a.stop() + if alg == "SAC": + a = config.build(env="multi_agent_mountaincar") + else: + a = config.build(env="multi_agent_cartpole") + + results = a.train() + check_train_results(results) + print(results) + a.stop() class TestSupportedMultiAgentPolicyGradient(unittest.TestCase): diff --git a/rllib/tests/test_timesteps.py b/rllib/tests/test_timesteps.py index 9725bb22ebdb..6b95864d26aa 100644 --- a/rllib/tests/test_timesteps.py +++ b/rllib/tests/test_timesteps.py @@ -4,7 +4,7 @@ import ray import ray.rllib.algorithms.ppo as ppo from ray.rllib.examples.envs.classes.random_env import RandomEnv -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import check class TestTimeSteps(unittest.TestCase): @@ -34,27 +34,26 @@ def test_timesteps(self): obs = np.array(1) obs_batch = np.array([1]) - for _ in framework_iterator(config): - algo = config.build() - policy = algo.get_policy() - - for i in range(1, 21): - algo.compute_single_action(obs) - check(int(policy.global_timestep), i) - for i in range(1, 21): - policy.compute_actions(obs_batch) - check(int(policy.global_timestep), i + 20) - - # Artificially set ts to 100Bio, then keep computing actions and - # train. - crazy_timesteps = int(1e11) - policy.on_global_var_update({"timestep": crazy_timesteps}) - # Run for 10 more ts. - for i in range(1, 11): - policy.compute_actions(obs_batch) - check(int(policy.global_timestep), i + crazy_timesteps) - algo.train() - algo.stop() + algo = config.build() + policy = algo.get_policy() + + for i in range(1, 21): + algo.compute_single_action(obs) + check(int(policy.global_timestep), i) + for i in range(1, 21): + policy.compute_actions(obs_batch) + check(int(policy.global_timestep), i + 20) + + # Artificially set ts to 100Bio, then keep computing actions and + # train. + crazy_timesteps = int(1e11) + policy.on_global_var_update({"timestep": crazy_timesteps}) + # Run for 10 more ts. 
+ for i in range(1, 11): + policy.compute_actions(obs_batch) + check(int(policy.global_timestep), i + crazy_timesteps) + algo.train() + algo.stop() if __name__ == "__main__": diff --git a/rllib/utils/__init__.py b/rllib/utils/__init__.py index 479438daa533..01f8404da2f0 100644 --- a/rllib/utils/__init__.py +++ b/rllib/utils/__init__.py @@ -34,7 +34,6 @@ check, check_compute_single_action, check_train_results, - framework_iterator, ) from ray.tune.utils import merge_dicts, deep_update @@ -115,7 +114,6 @@ def __exit__(self, *args): "fc", "force_list", "force_tuple", - "framework_iterator", "lstm", "merge_dicts", "one_hot", diff --git a/rllib/utils/exploration/tests/test_curiosity.py b/rllib/utils/exploration/tests/test_curiosity.py index a0f91ce0c7cf..ddc5939c5df5 100644 --- a/rllib/utils/exploration/tests/test_curiosity.py +++ b/rllib/utils/exploration/tests/test_curiosity.py @@ -10,7 +10,7 @@ from ray.air.constants import TRAINING_ITERATION from ray.rllib.algorithms.callbacks import DefaultCallbacks import ray.rllib.algorithms.ppo as ppo -from ray.rllib.utils.test_utils import check_learning_achieved, framework_iterator +from ray.rllib.utils.test_utils import check_learning_achieved from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MAX, @@ -194,36 +194,35 @@ def test_curiosity_on_frozen_lake(self): ) num_iterations = 10 - for _ in framework_iterator(config, frameworks=("tf", "torch")): - # W/ Curiosity. Expect to learn something. - algo = config.build() - learnt = False - for i in range(num_iterations): - result = algo.train() - print(result) - if result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MAX] > 0.0: - print("Reached goal after {} iters!".format(i)) - learnt = True - break - algo.stop() - self.assertTrue(learnt) + # W/ Curiosity. Expect to learn something. + algo = config.build() + learnt = False + for i in range(num_iterations): + result = algo.train() + print(result) + if result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MAX] > 0.0: + print("Reached goal after {} iters!".format(i)) + learnt = True + break + algo.stop() + self.assertTrue(learnt) - # Disable this check for now. Add too much flakyness to test. - # if fw == "tf": - # # W/o Curiosity. Expect to learn nothing. - # print("Trying w/o curiosity (not expected to learn).") - # config["exploration_config"] = { - # "type": "StochasticSampling", - # } - # algo = ppo.PPO(config=config) - # rewards_wo = 0.0 - # for _ in range(num_iterations): - # result = algo.train() - # rewards_wo += result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] - # print(result) - # algo.stop() - # self.assertTrue(rewards_wo == 0.0) - # print("Did not reach goal w/o curiosity!") + # Disable this check for now. Add too much flakyness to test. + # if fw == "tf": + # # W/o Curiosity. Expect to learn nothing. 
+ # print("Trying w/o curiosity (not expected to learn).") + # config["exploration_config"] = { + # "type": "StochasticSampling", + # } + # algo = ppo.PPO(config=config) + # rewards_wo = 0.0 + # for _ in range(num_iterations): + # result = algo.train() + # rewards_wo += result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MEAN] + # print(result) + # algo.stop() + # self.assertTrue(rewards_wo == 0.0) + # print("Did not reach goal w/o curiosity!") def test_curiosity_on_partially_observable_domain(self): config = ( @@ -273,41 +272,40 @@ def test_curiosity_on_partially_observable_domain(self): TRAINING_ITERATION: 25, f"{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": min_reward, } - for _ in framework_iterator(config, frameworks="torch"): - # To replay: - # algo = ppo.PPO(config=config) - # algo.restore("[checkpoint file]") - # env = env_maker(config["env_config"]) - # obs, info = env.reset() - # for _ in range(10000): - # obs, reward, done, truncated, info = env.step( - # algo.compute_single_action(s) - # ) - # if done: - # obs, info = env.reset() - # env.render() + # To replay: + # algo = ppo.PPO(config=config) + # algo.restore("[checkpoint file]") + # env = env_maker(config["env_config"]) + # obs, info = env.reset() + # for _ in range(10000): + # obs, reward, done, truncated, info = env.step( + # algo.compute_single_action(s) + # ) + # if done: + # obs, info = env.reset() + # env.render() - results = tune.Tuner( - "PPO", - param_space=config, - run_config=air.RunConfig(stop=stop, verbose=1), - ).fit() - check_learning_achieved(results, min_reward) - iters = results.get_best_result().metrics[TRAINING_ITERATION] - print("Reached in {} iterations.".format(iters)) + results = tune.Tuner( + "PPO", + param_space=config, + run_config=air.RunConfig(stop=stop, verbose=1), + ).fit() + check_learning_achieved(results, min_reward) + iters = results.get_best_result().metrics[TRAINING_ITERATION] + print("Reached in {} iterations.".format(iters)) - # config_wo = config.copy() - # config_wo["exploration_config"] = {"type": "StochasticSampling"} - # stop_wo = stop.copy() - # stop_wo[TRAINING_ITERATION] = iters - # results = tune.Tuner( - # "PPO", param_space=config_wo, stop=stop_wo, verbose=1).fit() - # try: - # check_learning_achieved(results, min_reward) - # except ValueError: - # print("Did not learn w/o curiosity (expected).") - # else: - # raise ValueError("Learnt w/o curiosity (not expected)!") + # config_wo = config.copy() + # config_wo["exploration_config"] = {"type": "StochasticSampling"} + # stop_wo = stop.copy() + # stop_wo[TRAINING_ITERATION] = iters + # results = tune.Tuner( + # "PPO", param_space=config_wo, stop=stop_wo, verbose=1).fit() + # try: + # check_learning_achieved(results, min_reward) + # except ValueError: + # print("Did not learn w/o curiosity (expected).") + # else: + # raise ValueError("Learnt w/o curiosity (not expected)!") if __name__ == "__main__": diff --git a/rllib/utils/exploration/tests/test_explorations.py b/rllib/utils/exploration/tests/test_explorations.py index d63e879e7feb..0254664446ab 100644 --- a/rllib/utils/exploration/tests/test_explorations.py +++ b/rllib/utils/exploration/tests/test_explorations.py @@ -7,62 +7,60 @@ import ray.rllib.algorithms.impala as impala import ray.rllib.algorithms.ppo as ppo import ray.rllib.algorithms.sac as sac -from ray.rllib.utils import check, framework_iterator +from ray.rllib.utils import check def do_test_explorations(config, dummy_obs, prev_a=None, expected_mean_action=None): """Calls an Agent's `compute_actions` with different `explore` 
options.""" - # Test all frameworks. - for _ in framework_iterator(config): - print(f"Algorithm={config.algo_class}") - - # Test for both the default Agent's exploration AND the `Random` - # exploration class. - for exploration in [None, "Random"]: - local_config = config.copy() - if exploration == "Random": - if local_config.enable_rl_module_and_learner: - # TODO(Artur): Support Random exploration with RL Modules. - continue - local_config.env_runners(exploration_config={"type": "Random"}) - print("exploration={}".format(exploration or "default")) - - algo = local_config.build() - - # Make sure all actions drawn are the same, given same - # observations. - actions = [] - for _ in range(25): - actions.append( - algo.compute_single_action( - observation=dummy_obs, - explore=False, - prev_action=prev_a, - prev_reward=1.0 if prev_a is not None else None, - ) + print(f"Algorithm={config.algo_class}") + + # Test for both the default Agent's exploration AND the `Random` + # exploration class. + for exploration in [None, "Random"]: + local_config = config.copy() + if exploration == "Random": + if local_config.enable_rl_module_and_learner: + # TODO(Artur): Support Random exploration with RL Modules. + continue + local_config.env_runners(exploration_config={"type": "Random"}) + print("exploration={}".format(exploration or "default")) + + algo = local_config.build() + + # Make sure all actions drawn are the same, given same + # observations. + actions = [] + for _ in range(25): + actions.append( + algo.compute_single_action( + observation=dummy_obs, + explore=False, + prev_action=prev_a, + prev_reward=1.0 if prev_a is not None else None, ) - check(actions[-1], actions[0]) - - # Make sure actions drawn are different - # (around some mean value), given constant observations. - actions = [] - for _ in range(500): - actions.append( - algo.compute_single_action( - observation=dummy_obs, - explore=True, - prev_action=prev_a, - prev_reward=1.0 if prev_a is not None else None, - ) + ) + check(actions[-1], actions[0]) + + # Make sure actions drawn are different + # (around some mean value), given constant observations. + actions = [] + for _ in range(500): + actions.append( + algo.compute_single_action( + observation=dummy_obs, + explore=True, + prev_action=prev_a, + prev_reward=1.0 if prev_a is not None else None, ) - check( - np.mean(actions), - expected_mean_action if expected_mean_action is not None else 0.5, - atol=0.4, ) - # Check that the stddev is not 0.0 (values differ). - check(np.std(actions), 0.0, false=True) + check( + np.mean(actions), + expected_mean_action if expected_mean_action is not None else 0.5, + atol=0.4, + ) + # Check that the stddev is not 0.0 (values differ). 
+ check(np.std(actions), 0.0, false=True) class TestExplorations(unittest.TestCase): diff --git a/rllib/utils/exploration/tests/test_random_encoder.py b/rllib/utils/exploration/tests/test_random_encoder.py deleted file mode 100644 index aa8a021e28b9..000000000000 --- a/rllib/utils/exploration/tests/test_random_encoder.py +++ /dev/null @@ -1,84 +0,0 @@ -import sys -import unittest - -import pytest -import ray -from ray.rllib.utils.test_utils import framework_iterator -import ray.rllib.algorithms.ppo as ppo -import ray.rllib.algorithms.sac as sac -from ray.rllib.algorithms.callbacks import RE3UpdateCallbacks -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MAX, -) - - -class TestRE3(unittest.TestCase): - """Tests for RE3 exploration algorithm.""" - - @classmethod - def setUpClass(cls): - ray.init() - - @classmethod - def tearDownClass(cls): - ray.shutdown() - - def run_re3(self, rl_algorithm): - """Tests RE3 for PPO and SAC. - - Both the on-policy and off-policy setups are validated. - """ - if rl_algorithm == "PPO": - # We need to disable the RLModule / Learner API here, since this test is - # overfitted to the ModelV2 API stack. The random encoder is based on - # ModelV2 stack. - config = ppo.PPOConfig() - algo_cls = ppo.PPO - beta_schedule = "constant" - elif rl_algorithm == "SAC": - config = sac.SACConfig() - algo_cls = sac.SAC - beta_schedule = "linear_decay" - - config = config.to_dict() - - class RE3Callbacks(RE3UpdateCallbacks, config["callbacks"]): - pass - - config["env"] = "Pendulum-v1" - config["callbacks"] = RE3Callbacks - config["exploration_config"] = { - "type": "RE3", - "embeds_dim": 128, - "beta_schedule": beta_schedule, - "sub_exploration": { - "type": "StochasticSampling", - }, - } - - num_iterations = 60 - for _ in framework_iterator(config, frameworks=("tf", "tf2"), session=True): - algo = algo_cls(config=config) - learnt = False - for i in range(num_iterations): - result = algo.train() - print(result) - if result[ENV_RUNNER_RESULTS][EPISODE_RETURN_MAX] > -900.0: - print("Reached goal after {} iters!".format(i)) - learnt = True - break - algo.stop() - self.assertTrue(learnt) - - def test_re3_ppo(self): - """Tests RE3 with PPO.""" - self.run_re3("PPO") - - def test_re3_sac(self): - """Tests RE3 with SAC.""" - self.run_re3("SAC") - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/utils/schedules/tests/test_schedules.py b/rllib/utils/schedules/tests/test_schedules.py index b3f256d63540..ded2e926cf22 100644 --- a/rllib/utils/schedules/tests/test_schedules.py +++ b/rllib/utils/schedules/tests/test_schedules.py @@ -6,10 +6,9 @@ ExponentialSchedule, PiecewiseSchedule, ) -from ray.rllib.utils import check, framework_iterator, try_import_tf, try_import_torch +from ray.rllib.utils import check, try_import_torch from ray.rllib.utils.from_config import from_config -tf1, tf, tfv = try_import_tf() torch, _ = try_import_torch() @@ -22,34 +21,30 @@ def test_constant_schedule(self): config = {"value": value} - for fw in framework_iterator(frameworks=["tf2", "tf", "torch", None]): - constant = from_config(ConstantSchedule, config, framework=fw) - for t in ts: - out = constant(t) - check(out, value) + constant = from_config(ConstantSchedule, config, framework=None) + for t in ts: + out = constant(t) + check(out, value) - ts_as_tensors = self._get_framework_tensors(ts, fw) - for t in ts_as_tensors: - out = constant(t) - assert fw != "tf" or isinstance(out, tf.Tensor) - check(out, value, decimals=4) + ts_as_tensors = 
self._get_framework_tensors(ts, None) + for t in ts_as_tensors: + out = constant(t) + check(out, value, decimals=4) def test_linear_schedule(self): ts = [0, 50, 10, 100, 90, 2, 1, 99, 23, 1000] expected = [2.1 - (min(t, 100) / 100) * (2.1 - 0.6) for t in ts] config = {"schedule_timesteps": 100, "initial_p": 2.1, "final_p": 0.6} - for fw in framework_iterator(frameworks=["tf2", "tf", "torch", None]): - linear = from_config(LinearSchedule, config, framework=fw) - for t, e in zip(ts, expected): - out = linear(t) - check(out, e, decimals=4) + linear = from_config(LinearSchedule, config, framework=None) + for t, e in zip(ts, expected): + out = linear(t) + check(out, e, decimals=4) - ts_as_tensors = self._get_framework_tensors(ts, fw) - for t, e in zip(ts_as_tensors, expected): - out = linear(t) - assert fw != "tf" or isinstance(out, tf.Tensor) - check(out, e, decimals=4) + ts_as_tensors = self._get_framework_tensors(ts, None) + for t, e in zip(ts_as_tensors, expected): + out = linear(t) + check(out, e, decimals=4) def test_polynomial_schedule(self): ts = [0, 5, 10, 100, 90, 2, 1, 99, 23, 1000] @@ -62,17 +57,15 @@ def test_polynomial_schedule(self): power=2.0, ) - for fw in framework_iterator(frameworks=["tf2", "tf", "torch", None]): - polynomial = from_config(config, framework=fw) - for t, e in zip(ts, expected): - out = polynomial(t) - check(out, e, decimals=4) + polynomial = from_config(config, framework=None) + for t, e in zip(ts, expected): + out = polynomial(t) + check(out, e, decimals=4) - ts_as_tensors = self._get_framework_tensors(ts, fw) - for t, e in zip(ts_as_tensors, expected): - out = polynomial(t) - assert fw != "tf" or isinstance(out, tf.Tensor) - check(out, e, decimals=4) + ts_as_tensors = self._get_framework_tensors(ts, None) + for t, e in zip(ts_as_tensors, expected): + out = polynomial(t) + check(out, e, decimals=4) def test_exponential_schedule(self): decay_rate = 0.2 @@ -80,17 +73,15 @@ def test_exponential_schedule(self): expected = [2.0 * decay_rate ** (t / 100) for t in ts] config = dict(initial_p=2.0, decay_rate=decay_rate, schedule_timesteps=100) - for fw in framework_iterator(frameworks=["tf2", "tf", "torch", None]): - exponential = from_config(ExponentialSchedule, config, framework=fw) - for t, e in zip(ts, expected): - out = exponential(t) - check(out, e, decimals=4) + exponential = from_config(ExponentialSchedule, config, framework=None) + for t, e in zip(ts, expected): + out = exponential(t) + check(out, e, decimals=4) - ts_as_tensors = self._get_framework_tensors(ts, fw) - for t, e in zip(ts_as_tensors, expected): - out = exponential(t) - assert fw != "tf" or isinstance(out, tf.Tensor) - check(out, e, decimals=4) + ts_as_tensors = self._get_framework_tensors(ts, None) + for t, e in zip(ts_as_tensors, expected): + out = exponential(t) + check(out, e, decimals=4) def test_piecewise_schedule(self): ts = [0, 5, 10, 100, 90, 2, 1, 99, 27] @@ -99,24 +90,20 @@ def test_piecewise_schedule(self): endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)], outside_value=14.5 ) - for fw in framework_iterator(frameworks=["tf2", "tf", "torch", None]): - piecewise = from_config(PiecewiseSchedule, config, framework=fw) - for t, e in zip(ts, expected): - out = piecewise(t) - check(out, e, decimals=4) + piecewise = from_config(PiecewiseSchedule, config, framework=None) + for t, e in zip(ts, expected): + out = piecewise(t) + check(out, e, decimals=4) - ts_as_tensors = self._get_framework_tensors(ts, fw) - for t, e in zip(ts_as_tensors, expected): - out = piecewise(t) - assert fw != "tf" or 
isinstance(out, tf.Tensor) - check(out, e, decimals=4) + ts_as_tensors = self._get_framework_tensors(ts, None) + for t, e in zip(ts_as_tensors, expected): + out = piecewise(t) + check(out, e, decimals=4) @staticmethod def _get_framework_tensors(ts, fw): if fw == "torch": ts = [torch.tensor(t, dtype=torch.int32) for t in ts] - elif fw is not None and "tf" in fw: - ts = [tf.constant(t, dtype=tf.int32) for t in ts] return ts diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index cea7c31543e2..26f83ff2a2bc 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -13,7 +13,6 @@ import pprint import random import re -import sys import time import tree # pip install dm_tree from typing import ( @@ -22,7 +21,6 @@ Dict, List, Optional, - Sequence, Tuple, Type, Union, @@ -62,13 +60,6 @@ jax, _ = try_import_jax() tf1, tf, tfv = try_import_tf() -if tf1: - eager_mode = None - try: - from tensorflow.python.eager.context import eager_mode - except (ImportError, ModuleNotFoundError): - pass - torch, _ = try_import_torch() logger = logging.getLogger(__name__) @@ -947,105 +938,6 @@ def check_train_results(train_results: ResultDict): return train_results -def framework_iterator( - config: Optional["AlgorithmConfig"] = None, - frameworks: Sequence[str] = ("tf2", "tf", "torch"), - session: bool = False, - time_iterations: Optional[dict] = None, -) -> Union[str, Tuple[str, Optional["tf1.Session"]]]: - """An generator that allows for looping through n frameworks for testing. - - Provides the correct config entries ("framework") as well - as the correct eager/non-eager contexts for tf/tf2. - - Args: - config: An optional config dict or AlgorithmConfig object. This will be modified - (value for "framework" changed) depending on the iteration. - frameworks: A list/tuple of the frameworks to be tested. - Allowed are: "tf2", "tf", "torch", and None. - session: If True and only in the tf-case: Enter a tf.Session() - and yield that as second return value (otherwise yield (fw, None)). - Also sets a seed (42) on the session to make the test - deterministic. - time_iterations: If provided, will write to the given dict (by - framework key) the times in seconds that each (framework's) - iteration takes. - - Yields: - If `session` is False: The current framework [tf2|tf|torch] used. - If `session` is True: A tuple consisting of the current framework - string and the tf1.Session (if fw="tf", otherwise None). - """ - config = config or {} - frameworks = [frameworks] if isinstance(frameworks, str) else list(frameworks) - - for fw in frameworks: - # Skip tf if on new API stack. - if fw == "tf" and config.get("enable_rl_module_and_learner", False): - logger.warning("Skipping `framework=tf` (new API stack configured)!") - continue - # Skip if tf/tf2 and py >= 3.11. - elif fw in ["tf", "tf2"] and ( - sys.version_info.major == 3 and sys.version_info.minor >= 9 - ): - logger.warning("Skipping `framework=tf/tf2` (python >= 3.9)!") - continue - - # Skip non-installed frameworks. 
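All of the schedule tests above now build their schedule via `from_config(..., framework=None)` and evaluate it at plain Python ints. A small worked sketch for the piecewise case, reusing the endpoint values from the test config:

from ray.rllib.utils.from_config import from_config
from ray.rllib.utils.schedules import PiecewiseSchedule

config = dict(
    endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
    outside_value=14.5,
)
piecewise = from_config(PiecewiseSchedule, config, framework=None)

# Linear interpolation between neighboring endpoints ...
assert abs(piecewise(10) - 70.0) < 1e-6   # 50 + (10 / 25) * (100 - 50)
assert abs(piecewise(27) - 140.0) < 1e-6  # 100 + (2 / 5) * (200 - 100)
# ... and the constant outside_value beyond the last endpoint.
assert piecewise(100) == 14.5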
- if fw == "torch" and not torch: - logger.warning("framework_iterator skipping torch (not installed)!") - continue - if fw != "torch" and not tf: - logger.warning( - "framework_iterator skipping {} (tf not installed)!".format(fw) - ) - continue - elif fw == "tf2" and tfv != 2: - logger.warning("framework_iterator skipping tf2.x (tf version is < 2.0)!") - continue - elif fw == "jax" and not jax: - logger.warning("framework_iterator skipping JAX (not installed)!") - continue - assert fw in ["tf2", "tf", "torch", "jax", None] - - # Do we need a test session? - sess = None - if fw == "tf" and session is True: - sess = tf1.Session() - sess.__enter__() - tf1.set_random_seed(42) - - if isinstance(config, dict): - config["framework"] = fw - else: - config.framework(fw) - - eager_ctx = None - # Enable eager mode for tf2. - if fw == "tf2": - eager_ctx = eager_mode() - eager_ctx.__enter__() - assert tf1.executing_eagerly() - # Make sure, eager mode is off. - elif fw == "tf": - assert not tf1.executing_eagerly() - - # Yield current framework + tf-session (if necessary). - print(f"framework={fw}") - time_started = time.time() - yield fw if session is False else (fw, sess) - if time_iterations is not None: - time_total = time.time() - time_started - time_iterations[fw] = time_total - print(f".. took {time_total}sec") - - # Exit any context we may have entered. - if eager_ctx: - eager_ctx.__exit__(None, None, None) - elif sess: - sess.__exit__(None, None, None) - - @Deprecated(new="run_learning_tests_from_yaml_or_py(config_files=...)", error=False) def run_learning_tests_from_yaml( yaml_files: List[str], @@ -1809,47 +1701,46 @@ def check_reproducibilty( ) ) - for fw in framework_iterator(algo_config, **fw_kwargs): - print( - f"Testing reproducibility of {algo_class.__name__}" - f" with {num_workers} workers on fw = {fw}" + print( + f"Testing reproducibility of {algo_class.__name__}" + f" with {num_workers} workers" + ) + print("/// config") + pprint.pprint(algo_config.to_dict()) + # test tune.Tuner().fit() reproducibility + results1 = tune.Tuner( + algo_class, + param_space=algo_config.to_dict(), + run_config=air.RunConfig(stop=stop_dict, verbose=1), + ).fit() + results1 = results1.get_best_result().metrics + + results2 = tune.Tuner( + algo_class, + param_space=algo_config.to_dict(), + run_config=air.RunConfig(stop=stop_dict, verbose=1), + ).fit() + results2 = results2.get_best_result().metrics + + # Test rollout behavior. + check( + results1[ENV_RUNNER_RESULTS]["hist_stats"], + results2[ENV_RUNNER_RESULTS]["hist_stats"], + ) + # As well as training behavior (minibatch sequence during SGD + # iterations). + # As well as training behavior (minibatch sequence during SGD + # iterations). + if algo_config.enable_rl_module_and_learner: + check( + results1["info"][LEARNER_INFO][DEFAULT_POLICY_ID], + results2["info"][LEARNER_INFO][DEFAULT_POLICY_ID], ) - print("/// config") - pprint.pprint(algo_config.to_dict()) - # test tune.Tuner().fit() reproducibility - results1 = tune.Tuner( - algo_class, - param_space=algo_config.to_dict(), - run_config=air.RunConfig(stop=stop_dict, verbose=1), - ).fit() - results1 = results1.get_best_result().metrics - - results2 = tune.Tuner( - algo_class, - param_space=algo_config.to_dict(), - run_config=air.RunConfig(stop=stop_dict, verbose=1), - ).fit() - results2 = results2.get_best_result().metrics - - # Test rollout behavior. 
+ else: check( - results1[ENV_RUNNER_RESULTS]["hist_stats"], - results2[ENV_RUNNER_RESULTS]["hist_stats"], + results1["info"][LEARNER_INFO][DEFAULT_POLICY_ID]["learner_stats"], + results2["info"][LEARNER_INFO][DEFAULT_POLICY_ID]["learner_stats"], ) - # As well as training behavior (minibatch sequence during SGD - # iterations). - # As well as training behavior (minibatch sequence during SGD - # iterations). - if algo_config.enable_rl_module_and_learner: - check( - results1["info"][LEARNER_INFO][DEFAULT_POLICY_ID], - results2["info"][LEARNER_INFO][DEFAULT_POLICY_ID], - ) - else: - check( - results1["info"][LEARNER_INFO][DEFAULT_POLICY_ID]["learner_stats"], - results2["info"][LEARNER_INFO][DEFAULT_POLICY_ID]["learner_stats"], - ) def get_cartpole_dataset_reader(batch_size: int = 1) -> "DatasetReader": @@ -2030,85 +1921,80 @@ def test_ckpt_restore( if replay_buffer: config["store_buffer_in_checkpoints"] = True - frameworks = (["tf2"] if tf2 else []) + ["torch", "tf"] - for fw in framework_iterator(config, frameworks=frameworks): - env = gym.make(env_name) - alg1 = config.environment(env_name).framework(fw).build() - alg2 = config.environment(env_name).build() - - policy1 = alg1.get_policy() - - res = alg1.train() - print("current status: " + str(res)) - - # Check optimizer state as well. - optim_state = policy1.get_state().get("_optimizer_variables") - - checkpoint = alg1.save() - - # Test if we can restore multiple times (at least twice, assuming failure - # would mainly stem from improperly reused variables) - for num_restores in range(2): - # Sync the models - alg2.restore(checkpoint) - - # Compare optimizer state with re-loaded one. - if optim_state: - s2 = alg2.get_policy().get_state().get("_optimizer_variables") - # Tf -> Compare states 1:1. - if fw in ["tf2", "tf"]: - check(s2, optim_state) - # For torch, optimizers have state_dicts with keys=params, - # which are different for the two models (ignore these - # different keys, but compare all values nevertheless). - else: - for i, s2_ in enumerate(s2): - check( - list(s2_["state"].values()), - list(optim_state[i]["state"].values()), - ) + env = gym.make(env_name) + alg1 = config.environment(env_name).framework("torch").build() + alg2 = config.environment(env_name).build() - # Compare buffer content with restored one. - if replay_buffer: - data = alg1.local_replay_buffer.replay_buffers["default_policy"]._storage[ - 42 : 42 + 42 - ] - new_data = alg2.local_replay_buffer.replay_buffers[ - "default_policy" - ]._storage[42 : 42 + 42] - check(data, new_data) - - # Check, whether the eval EnvRunnerGroup has the same policies and - # `policy_mapping_fn`. - if eval_env_runner_group: - eval_mapping_src = inspect.getsource(alg1.eval_env_runner.policy_mapping_fn) - check( - eval_mapping_src, - inspect.getsource(alg2.eval_env_runner.policy_mapping_fn), - ) + policy1 = alg1.get_policy() + + res = alg1.train() + print("current status: " + str(res)) + + # Check optimizer state as well. + optim_state = policy1.get_state().get("_optimizer_variables") + + checkpoint = alg1.save() + + # Test if we can restore multiple times (at least twice, assuming failure + # would mainly stem from improperly reused variables) + for num_restores in range(2): + # Sync the models + alg2.restore(checkpoint) + + # Compare optimizer state with re-loaded one. + if optim_state: + s2 = alg2.get_policy().get_state().get("_optimizer_variables") + # Tf -> Compare states 1:1. 
+ # For torch, optimizers have state_dicts with keys=params, + # which are different for the two models (ignore these + # different keys, but compare all values nevertheless). + for i, s2_ in enumerate(s2): check( - eval_mapping_src, - inspect.getsource(alg2.env_runner.policy_mapping_fn), - false=True, + list(s2_["state"].values()), + list(optim_state[i]["state"].values()), ) - for _ in range(1): - obs = env.observation_space.sample() - a1 = _get_mean_action_from_algorithm(alg1, obs) - a2 = _get_mean_action_from_algorithm(alg2, obs) - print("Checking computed actions", alg1, obs, a1, a2) - if abs(a1 - a2) > 0.1: - raise AssertionError( - "algo={} [a1={} a2={}]".format(str(alg1.__class__), a1, a2) - ) - # Stop algo 1. - alg1.stop() + # Compare buffer content with restored one. + if replay_buffer: + data = alg1.local_replay_buffer.replay_buffers["default_policy"]._storage[ + 42 : 42 + 42 + ] + new_data = alg2.local_replay_buffer.replay_buffers["default_policy"]._storage[ + 42 : 42 + 42 + ] + check(data, new_data) + + # Check, whether the eval EnvRunnerGroup has the same policies and + # `policy_mapping_fn`. + if eval_env_runner_group: + eval_mapping_src = inspect.getsource(alg1.eval_env_runner.policy_mapping_fn) + check( + eval_mapping_src, + inspect.getsource(alg2.eval_env_runner.policy_mapping_fn), + ) + check( + eval_mapping_src, + inspect.getsource(alg2.env_runner.policy_mapping_fn), + false=True, + ) + + for _ in range(1): + obs = env.observation_space.sample() + a1 = _get_mean_action_from_algorithm(alg1, obs) + a2 = _get_mean_action_from_algorithm(alg2, obs) + print("Checking computed actions", alg1, obs, a1, a2) + if abs(a1 - a2) > 0.1: + raise AssertionError( + "algo={} [a1={} a2={}]".format(str(alg1.__class__), a1, a2) + ) + # Stop algo 1. + alg1.stop() - if run_restored_algorithm: - # Check that algo 2 can still run. - print("Starting second run on Algo 2...") - alg2.train() - alg2.stop() + if run_restored_algorithm: + # Check that algo 2 can still run. + print("Starting second run on Algo 2...") + alg2.train() + alg2.stop() def check_supported_spaces( @@ -2137,11 +2023,8 @@ def check_supported_spaces( """ - # do these imports here because otherwise we have circular imports + # Do these imports here because otherwise we have circular imports. from ray.rllib.examples.envs.classes.random_env import RandomEnv - from ray.rllib.models.tf.complex_input_net import ComplexInputNetwork as ComplexNet - from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as FCNet - from ray.rllib.models.tf.visionnet import VisionNetwork as VisionNet from ray.rllib.models.torch.complex_input_net import ( ComplexInputNetwork as TorchComplexNet, ) @@ -2193,8 +2076,6 @@ def check_supported_spaces( "dict", ] - rlmodule_supported_frameworks = ("torch", "tf2") - # The action spaces that we test RLModules with rlmodule_supported_action_spaces = ["discrete", "continuous"] @@ -2261,25 +2142,16 @@ def _do_check(alg, config, a_name, o_name): if alg not in ["SAC", "PPO"]: # 2D (image) input: Expect VisionNet. if o_name in ["atari", "image"]: - if fw == "torch": - assert isinstance(algo.get_policy().model, TorchVisionNet) - else: - assert isinstance(algo.get_policy().model, VisionNet) + assert isinstance(algo.get_policy().model, TorchVisionNet) # 1D input: Expect FCNet. 
elif o_name == "continuous": - if fw == "torch": - assert isinstance(algo.get_policy().model, TorchFCNet) - else: - assert isinstance(algo.get_policy().model, FCNet) + assert isinstance(algo.get_policy().model, TorchFCNet) # Could be either one: ComplexNet (if disabled Preprocessor) # or FCNet (w/ Preprocessor). elif o_name == "vector2d": - if fw == "torch": - assert isinstance( - algo.get_policy().model, (TorchComplexNet, TorchFCNet) - ) - else: - assert isinstance(algo.get_policy().model, (ComplexNet, FCNet)) + assert isinstance( + algo.get_policy().model, (TorchComplexNet, TorchFCNet) + ) if train: algo.train() algo.stop() @@ -2288,21 +2160,14 @@ def _do_check(alg, config, a_name, o_name): if not frameworks: frameworks = ("tf2", "tf", "torch") - if config.enable_rl_module_and_learner: - # Only test the frameworks that are supported by RLModules. - frameworks = tuple( - fw for fw in frameworks if fw in rlmodule_supported_frameworks - ) - _do_check_remote = ray.remote(_do_check) _do_check_remote = _do_check_remote.options(num_gpus=1 if use_gpu else 0) - for _ in framework_iterator(config, frameworks=frameworks): - # Test all action spaces first. - for a_name in action_spaces_to_test.keys(): - o_name = default_observation_space - ray.get(_do_check_remote.remote(alg, config, a_name, o_name)) - - # Now test all observation spaces. - for o_name in observation_spaces_to_test.keys(): - a_name = default_action_space - ray.get(_do_check_remote.remote(alg, config, a_name, o_name)) + # Test all action spaces first. + for a_name in action_spaces_to_test.keys(): + o_name = default_observation_space + ray.get(_do_check_remote.remote(alg, config, a_name, o_name)) + + # Now test all observation spaces. + for o_name in observation_spaces_to_test.keys(): + a_name = default_action_space + ray.get(_do_check_remote.remote(alg, config, a_name, o_name)) diff --git a/rllib/utils/tests/test_errors.py b/rllib/utils/tests/test_errors.py index 2c7cca9a19f9..49410cede5bf 100644 --- a/rllib/utils/tests/test_errors.py +++ b/rllib/utils/tests/test_errors.py @@ -4,7 +4,6 @@ import ray.rllib.algorithms.impala as impala import ray.rllib.algorithms.ppo as ppo from ray.rllib.utils.error import EnvError -from ray.rllib.utils.test_utils import framework_iterator class TestErrors(unittest.TestCase): @@ -26,13 +25,12 @@ def test_no_gpus_error(self): config = impala.IMPALAConfig().environment("CartPole-v1") - for _ in framework_iterator(config): - self.assertRaisesRegex( - RuntimeError, - # (?s): "dot matches all" (also newlines). - "(?s)Found 0 GPUs on your machine.+To change the config", - lambda: config.build(), - ) + self.assertRaisesRegex( + RuntimeError, + # (?s): "dot matches all" (also newlines). + "(?s)Found 0 GPUs on your machine.+To change the config", + lambda: config.build(), + ) def test_bad_envs(self): """Tests different "bad env" errors.""" @@ -42,41 +40,37 @@ def test_bad_envs(self): .environment("Alien-Attack-v42") ) - for _ in framework_iterator(config): - self.assertRaisesRegex( - EnvError, - f"The env string you provided \\('{config.env}'\\) is", - lambda: config.build(), - ) + self.assertRaisesRegex( + EnvError, + f"The env string you provided \\('{config.env}'\\) is", + lambda: config.build(), + ) # Malformed gym env string (must have v\d at end). 
config.environment("Alien-Attack-part-42") - for _ in framework_iterator(config): - self.assertRaisesRegex( - EnvError, - f"The env string you provided \\('{config.env}'\\) is", - lambda: config.build(), - ) + self.assertRaisesRegex( + EnvError, + f"The env string you provided \\('{config.env}'\\) is", + lambda: config.build(), + ) # Non-existing class in a full-class-path. config.environment( "ray.rllib.examples.envs.classes.random_env.RandomEnvThatDoesntExist" ) - for _ in framework_iterator(config): - self.assertRaisesRegex( - EnvError, - f"The env string you provided \\('{config.env}'\\) is", - lambda: config.build(), - ) + self.assertRaisesRegex( + EnvError, + f"The env string you provided \\('{config.env}'\\) is", + lambda: config.build(), + ) # Non-existing module inside a full-class-path. config.environment("ray.rllib.examples.envs.module_that_doesnt_exist.SomeEnv") - for _ in framework_iterator(config): - self.assertRaisesRegex( - EnvError, - f"The env string you provided \\('{config.env}'\\) is", - lambda: config.build(), - ) + self.assertRaisesRegex( + EnvError, + f"The env string you provided \\('{config.env}'\\) is", + lambda: config.build(), + ) if __name__ == "__main__": diff --git a/rllib/utils/tests/test_framework_agnostic_components.py b/rllib/utils/tests/test_framework_agnostic_components.py index 9a50262d79c9..743f30c5c790 100644 --- a/rllib/utils/tests/test_framework_agnostic_components.py +++ b/rllib/utils/tests/test_framework_agnostic_components.py @@ -7,7 +7,7 @@ from ray.rllib.utils.exploration.exploration import Exploration from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.from_config import from_config -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import check tf1, tf, tfv = try_import_tf() torch, _ = try_import_torch() @@ -63,102 +63,88 @@ def test_dummy_components(self): script_dir = Path(__file__).parent abs_path = script_dir.absolute() - for fw, sess in framework_iterator(session=True): - # Try to create from an abstract class w/o default constructor. - # Expect None. - test = from_config({"type": AbstractDummyComponent, "framework": fw}) - check(test, None) - - # Create a Component via python API (config dict). - component = from_config( - dict( - type=DummyComponent, prop_a=1.0, prop_d="non_default", framework=fw - ) - ) - check(component.prop_d, "non_default") - - # Create a tf Component from json file. - config_file = str(abs_path.joinpath("dummy_config.json")) - component = from_config(config_file, framework=fw) - check(component.prop_c, "default") - check(component.prop_d, 4) # default - value = component.add(3.3) - if sess: - value = sess.run(value) - check(value, 5.3) # prop_b == 2.0 - - # Create a torch Component from yaml file. - config_file = str(abs_path.joinpath("dummy_config.yml")) - component = from_config(config_file, framework=fw) - check(component.prop_a, "something else") - check(component.prop_d, 3) - value = component.add(1.2) - if sess: - value = sess.run(value) - check(value, np.array([2.2])) # prop_b == 1.0 - - # Create tf Component from json-string (e.g. on command line). - component = from_config( - '{"type": "ray.rllib.utils.tests.' 
- 'test_framework_agnostic_components.DummyComponent", ' - '"prop_a": "A", "prop_b": -1.0, "prop_c": "non-default", ' - '"framework": "' + fw + '"}' - ) - check(component.prop_a, "A") - check(component.prop_d, 4) # default - value = component.add(-1.1) - if sess: - value = sess.run(value) - check(value, -2.1) # prop_b == -1.0 - - # Test recognizing default module path. - component = from_config( - DummyComponent, - '{"type": "NonAbstractChildOfDummyComponent", ' - '"prop_a": "A", "prop_b": -1.0, "prop_c": "non-default",' - '"framework": "' + fw + '"}', - ) - check(component.prop_a, "A") - check(component.prop_d, 4) # default - value = component.add(-1.1) - if sess: - value = sess.run(value) - check(value, -2.1) # prop_b == -1.0 - - # Test recognizing default package path. - scope = None - if sess: - scope = tf1.variable_scope("exploration_object") - scope.__enter__() - component = from_config( - Exploration, - { - "type": "EpsilonGreedy", - "action_space": Discrete(2), - "framework": fw, - "num_workers": 0, - "worker_index": 0, - "policy_config": {}, - "model": None, - }, - ) - if scope: - scope.__exit__(None, None, None) - check(component.epsilon_schedule.outside_value, 0.05) # default - - # Create torch Component from yaml-string. - component = from_config( - "type: ray.rllib.utils.tests." - "test_framework_agnostic_components.DummyComponent\n" - "prop_a: B\nprop_b: -1.5\nprop_c: non-default\nframework: " - "{}".format(fw) + # Try to create from an abstract class w/o default constructor. + # Expect None. + test = from_config({"type": AbstractDummyComponent, "framework": "torch"}) + check(test, None) + + # Create a Component via python API (config dict). + component = from_config( + dict( + type=DummyComponent, prop_a=1.0, prop_d="non_default", framework="torch" ) - check(component.prop_a, "B") - check(component.prop_d, 4) # default - value = component.add(-5.1) - if sess: - value = sess.run(value) - check(value, np.array([-6.6])) # prop_b == -1.5 + ) + check(component.prop_d, "non_default") + + # Create a tf Component from json file. + config_file = str(abs_path.joinpath("dummy_config.json")) + component = from_config(config_file, framework="torch") + check(component.prop_c, "default") + check(component.prop_d, 4) # default + value = component.add(3.3) + check(value, 5.3) # prop_b == 2.0 + + # Create a torch Component from yaml file. + config_file = str(abs_path.joinpath("dummy_config.yml")) + component = from_config(config_file, framework="torch") + check(component.prop_a, "something else") + check(component.prop_d, 3) + value = component.add(1.2) + check(value, np.array([2.2])) # prop_b == 1.0 + + # Create tf Component from json-string (e.g. on command line). + component = from_config( + '{"type": "ray.rllib.utils.tests.' + 'test_framework_agnostic_components.DummyComponent", ' + '"prop_a": "A", "prop_b": -1.0, "prop_c": "non-default", ' + '"framework": "' + "torch" + '"}' + ) + check(component.prop_a, "A") + check(component.prop_d, 4) # default + value = component.add(-1.1) + check(value, -2.1) # prop_b == -1.0 + + # Test recognizing default module path. + component = from_config( + DummyComponent, + '{"type": "NonAbstractChildOfDummyComponent", ' + '"prop_a": "A", "prop_b": -1.0, "prop_c": "non-default",' + '"framework": "torch"}', + ) + check(component.prop_a, "A") + check(component.prop_d, 4) # default + value = component.add(-1.1) + check(value, -2.1) # prop_b == -1.0 + + # Test recognizing default package path. 
+ scope = None + component = from_config( + Exploration, + { + "type": "EpsilonGreedy", + "action_space": Discrete(2), + "framework": "torch", + "num_workers": 0, + "worker_index": 0, + "policy_config": {}, + "model": None, + }, + ) + if scope: + scope.__exit__(None, None, None) + check(component.epsilon_schedule.outside_value, 0.05) # default + + # Create torch Component from yaml-string. + component = from_config( + "type: ray.rllib.utils.tests." + "test_framework_agnostic_components.DummyComponent\n" + "prop_a: B\nprop_b: -1.5\nprop_c: non-default\nframework: " + "torch" + ) + check(component.prop_a, "B") + check(component.prop_d, 4) # default + value = component.add(-5.1) + check(value, np.array([-6.6])) # prop_b == -1.5 def test_unregistered_envs(self): """Tests, whether an Env can be specified simply by its absolute class."""