AgileRL updates #1220

Open
wants to merge 3 commits into master
219 changes: 121 additions & 98 deletions tutorials/AgileRL/agilerl_dqn_curriculum.py

Large diffs are not rendered by default.

204 changes: 128 additions & 76 deletions tutorials/AgileRL/agilerl_maddpg.py
@@ -2,6 +2,7 @@

Authors: Michael (https://github.com/mikepratt1), Nick (https://github.com/nicku-a)
"""

import os

import numpy as np
@@ -10,14 +11,14 @@
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
from agilerl.utils.utils import initialPopulation
from agilerl.utils.utils import create_population
from agilerl.wrappers.pettingzoo_wrappers import PettingZooVectorizationParallelWrapper
from tqdm import trange

from pettingzoo.atari import space_invaders_v2

if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("===== AgileRL MADDPG Demo =====")

# Define the network configuration
NET_CONFIG = {
@@ -35,15 +36,21 @@
"ALGO": "MADDPG", # Algorithm
# Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
"CHANNELS_LAST": True,
"BATCH_SIZE": 8, # Batch size
"BATCH_SIZE": 32, # Batch size
"O_U_NOISE": True, # Ornstein Uhlenbeck action noise
"EXPL_NOISE": 0.1, # Action noise scale
"MEAN_NOISE": 0.0, # Mean action noise
"THETA": 0.15, # Rate of mean reversion in OU noise
"DT": 0.01, # Timestep for OU noise
"LR_ACTOR": 0.001, # Actor learning rate
"LR_CRITIC": 0.01, # Critic learning rate
"LR_CRITIC": 0.001, # Critic learning rate
"GAMMA": 0.95, # Discount factor
"MEMORY_SIZE": 10000, # Max memory buffer size
"LEARN_STEP": 5, # Learning frequency
"MEMORY_SIZE": 100000, # Max memory buffer size
"LEARN_STEP": 100, # Learning frequency
"TAU": 0.01, # For soft update of target parameters
}
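# The O_U_NOISE settings above follow the standard Ornstein-Uhlenbeck process:
# EXPL_NOISE is the noise scale (sigma), MEAN_NOISE its long-run mean (mu),
# THETA the rate of mean reversion and DT the process timestep. A standalone
# numpy sketch of this process is given after the diff.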

num_envs = 8
# Define the space invaders environment as a parallel environment
env = space_invaders_v2.parallel_env()
if INIT_HP["CHANNELS_LAST"]:
@@ -53,6 +60,7 @@
env = ss.color_reduction_v0(env, mode="B")
env = ss.resize_v1(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 4)
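# The vectorization wrapper below runs num_envs copies of the parallel
# environment side by side, so reset() and step() return observations and
# rewards batched per agent.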
env = PettingZooVectorizationParallelWrapper(env, n_envs=num_envs)
env.reset()

# Configure the multi-agent algo input arguments
@@ -84,14 +92,15 @@
INIT_HP["AGENT_IDS"] = env.agents

# Create a population ready for evolutionary hyper-parameter optimisation
pop = initialPopulation(
pop = create_population(
INIT_HP["ALGO"],
state_dim,
action_dim,
one_hot,
NET_CONFIG,
INIT_HP,
population_size=INIT_HP["POPULATION_SIZE"],
num_envs=num_envs,
device=device,
)

@@ -109,8 +118,8 @@
tournament_size=2, # Tournament selection size
elitism=True, # Elitism in tournament selection
population_size=INIT_HP["POPULATION_SIZE"], # Population size
evo_step=1,
) # Evaluate using last N fitness scores
eval_loop=1, # Evaluate using last N fitness scores
)

# Instantiate a mutations object (used for HPO)
mutations = Mutations(
@@ -128,7 +137,7 @@
], # RL hyperparams selected for mutation
mutation_sd=0.1, # Mutation strength
# Define search space for each hyperparameter
min_lr=0.0001,
min_lr=0.00001,
max_lr=0.01,
min_learn_step=1,
max_learn_step=120,
@@ -141,26 +150,32 @@
)

# Define training loop parameters
max_episodes = 5 # Total episodes (default: 6000)
max_steps = 900 # Maximum steps to take in each episode
epsilon = 1.0 # Starting epsilon value
eps_end = 0.1 # Final epsilon value
eps_decay = 0.995 # Epsilon decay
evo_epochs = 20 # Evolution frequency
evo_loop = 1 # Number of evaluation episodes
max_steps = 4500 # Max steps (default: 2000000)
learning_delay = 500 # Steps before starting learning
evo_steps = 10000 # Evolution frequency
eval_steps = None # Evaluation steps per episode - go until done
eval_loop = 1 # Number of evaluation episodes
elite = pop[0] # Assign a placeholder "elite" agent

# Training loop
for idx_epi in trange(max_episodes):
total_steps = 0

# TRAINING LOOP
print("Training...")
pbar = trange(max_steps, unit="step")
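# This loop is budgeted in environment steps rather than episodes: each agent
# collects roughly evo_steps timesteps per evolutionary cycle, and the while
# loop exits once any agent's step count reaches max_steps.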
while np.less([agent.steps[-1] for agent in pop], max_steps).all():
pop_episode_scores = []
for agent in pop: # Loop through population
state, info = env.reset() # Reset environment at start of episode
agent_reward = {agent_id: 0 for agent_id in env.agents}
scores = np.zeros(num_envs)
completed_episode_scores = []
steps = 0
if INIT_HP["CHANNELS_LAST"]:
state = {
agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3])
agent_id: np.moveaxis(s, [-1], [-3])
for agent_id, s in state.items()
}
for _ in range(max_steps):

for idx_step in range(evo_steps // num_envs):
agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
env_defined_actions = (
info["env_defined_actions"]
@@ -169,87 +184,124 @@
)

# Get next action from agent
cont_actions, discrete_action = agent.getAction(
state, epsilon, agent_mask, env_defined_actions
cont_actions, discrete_action = agent.get_action(
states=state,
training=True,
agent_mask=agent_mask,
env_defined_actions=env_defined_actions,
)
if agent.discrete_actions:
action = discrete_action
else:
action = cont_actions

next_state, reward, termination, truncation, info = env.step(
action
) # Act in environment
# Act in environment
next_state, reward, termination, truncation, info = env.step(action)

scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1)
total_steps += num_envs
steps += num_envs

# Image processing if necessary for the environment
if INIT_HP["CHANNELS_LAST"]:
state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
next_state = {
agent_id: np.moveaxis(ns, [-1], [-3])
for agent_id, ns in next_state.items()
}

# Save experiences to replay buffer
memory.save2memory(state, cont_actions, reward, next_state, termination)

# Collect the reward
for agent_id, r in reward.items():
agent_reward[agent_id] += r
memory.save_to_memory(
state,
cont_actions,
reward,
next_state,
termination,
is_vectorised=True,
)
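# With num_envs vectorised environments, each loop iteration advances num_envs
# environment steps, so the learning frequency below is rescaled accordingly.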

# Learn according to learning frequency
if (memory.counter % agent.learn_step == 0) and (
len(memory) >= agent.batch_size
# Handle learn steps > num_envs
if agent.learn_step > num_envs:
learn_step = agent.learn_step // num_envs
if (
idx_step % learn_step == 0
and len(memory) >= agent.batch_size
and memory.counter > learning_delay
):
# Sample replay buffer
experiences = memory.sample(agent.batch_size)
# Learn according to agent's RL algorithm
agent.learn(experiences)
# Handle num_envs > learn step; learn multiple times per step in env
elif (
len(memory) >= agent.batch_size and memory.counter > learning_delay
):
experiences = memory.sample(
agent.batch_size
) # Sample replay buffer
agent.learn(experiences) # Learn according to agent's RL algorithm
for _ in range(num_envs // agent.learn_step):
# Sample replay buffer
experiences = memory.sample(agent.batch_size)
# Learn according to agent's RL algorithm
agent.learn(experiences)

# Update the state
if INIT_HP["CHANNELS_LAST"]:
next_state = {
agent_id: np.expand_dims(ns, 0)
for agent_id, ns in next_state.items()
}
state = next_state

# Stop episode if any agents have terminated
if any(truncation.values()) or any(termination.values()):
break

# Save the total episode reward
score = sum(agent_reward.values())
agent.scores.append(score)

# Update epsilon for exploration
epsilon = max(eps_end, epsilon * eps_decay)

# Now evolve population if necessary
if (idx_epi + 1) % evo_epochs == 0:
# Evaluate population
fitnesses = [
agent.test(
env,
swap_channels=INIT_HP["CHANNELS_LAST"],
max_steps=max_steps,
loop=evo_loop,
)
for agent in pop
]
# Calculate scores and reset noise for finished episodes
reset_noise_indices = []
term_array = np.array(list(termination.values())).transpose()
trunc_array = np.array(list(truncation.values())).transpose()
for idx, (d, t) in enumerate(zip(term_array, trunc_array)):
if np.any(d) or np.any(t):
completed_episode_scores.append(scores[idx])
agent.scores.append(scores[idx])
scores[idx] = 0
reset_noise_indices.append(idx)
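# Ornstein-Uhlenbeck noise carries state between steps, so it is reset only
# for the vectorised env indices whose episodes have just finished.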
agent.reset_action_noise(reset_noise_indices)

pbar.update(evo_steps // len(pop))

print(f"Episode {idx_epi + 1}/{max_episodes}")
print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
print(
f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
agent.steps[-1] += steps
pop_episode_scores.append(completed_episode_scores)

# Evaluate population
fitnesses = [
agent.test(
env,
swap_channels=INIT_HP["CHANNELS_LAST"],
max_steps=eval_steps,
loop=eval_loop,
)
for agent in pop
]
mean_scores = [
(
np.mean(episode_scores)
if len(episode_scores) > 0
else "0 completed episodes"
)
for episode_scores in pop_episode_scores
]

print(f"--- Global steps {total_steps} ---")
print(f"Steps {[agent.steps[-1] for agent in pop]}")
print(f"Scores: {mean_scores}")
print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}')
print(
f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}'
)

# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)
# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)

# Update step counter
for agent in pop:
agent.steps.append(agent.steps[-1])

# Save the trained algorithm
path = "./models/MADDPG"
filename = "MADDPG_trained_agent.pt"
os.makedirs(path, exist_ok=True)
save_path = os.path.join(path, filename)
elite.saveCheckpoint(save_path)
elite.save_checkpoint(save_path)

pbar.close()
env.close()
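
For reference, the O_U_NOISE, EXPL_NOISE, MEAN_NOISE, THETA and DT hyperparameters above configure a discretised Ornstein-Uhlenbeck process for exploration noise. The snippet below is a minimal numpy sketch of that process under those settings; it is illustrative only, not AgileRL's internal implementation, and the function name ou_noise_step is ours.

import numpy as np

def ou_noise_step(x_prev, mu=0.0, theta=0.15, sigma=0.1, dt=0.01, rng=None):
    """One Euler-Maruyama step of the Ornstein-Uhlenbeck process:
    x_next = x_prev + theta * (mu - x_prev) * dt + sigma * sqrt(dt) * N(0, 1)
    """
    rng = np.random.default_rng() if rng is None else rng
    return (
        x_prev
        + theta * (mu - x_prev) * dt
        + sigma * np.sqrt(dt) * rng.standard_normal(size=x_prev.shape)
    )

# Example: correlated exploration noise for a 6-dimensional continuous action,
# using the same values as INIT_HP (EXPL_NOISE=0.1, THETA=0.15, DT=0.01).
noise = np.zeros(6)
for _ in range(5):
    noise = ou_noise_step(noise, mu=0.0, theta=0.15, sigma=0.1, dt=0.01)
    print(noise)

Because the process carries its state between calls, resetting exploration noise for a finished episode amounts to setting that state back to its initial value (typically zero).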