RLOO trainer: fix calculations of steps, episodes and epochs #2516

Open · wants to merge 3 commits into main

Changes from all commits

docs/source/rloo_trainer.md (2 changes: 1 addition & 1 deletion)

@@ -52,7 +52,7 @@ The logged metrics are as follows. Here is an example [tracked run at Weights an
 * `val/ratio_var`: The variance of the `val/ratio`, indicating the variability in policy changes.
 * `val/num_eos_tokens`: The number of end-of-sequence (EOS) tokens generated, which can indicate the number of complete responses.
 * `lr`: lr: The current learning rate used by the optimizer.
-* `episode`: episode: The current global step or episode count in the training process.
+* `episode`: episode: The current episode count in the training process.


 ## Cookbook

trl/trainer/rloo_trainer.py (8 changes: 4 additions & 4 deletions)

@@ -120,12 +120,12 @@ def __init__(
         # calculate various batch sizes
         #########
         if args.total_episodes is None:  # allow the users to define episodes in terms of epochs.
-            args.total_episodes = int(args.num_train_epochs * self.train_dataset_len)
+            args.total_episodes = int(args.num_train_epochs * self.train_dataset_len * args.rloo_k)
         accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps)
         self.accelerator = accelerator
         args.world_size = accelerator.num_processes
         args.local_batch_size = (
-            args.per_device_train_batch_size * args.gradient_accumulation_steps * args.num_mini_batches
+            args.per_device_train_batch_size * args.gradient_accumulation_steps
         )
         args.micro_batch_size = int(args.per_device_train_batch_size * args.world_size)
         args.batch_size = int(args.local_batch_size * args.world_size)
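
Taken together, the two edits above change how an "epoch" translates into episodes and how large each training batch is. Here is a small sketch with made-up numbers (none of these values come from the PR) that simply replays the changed formulas:

```python
# Hypothetical settings, purely illustrative.
num_train_epochs = 1
train_dataset_len = 1_000          # number of prompts in the training set
rloo_k = 2                         # completions sampled per prompt
per_device_train_batch_size = 4
gradient_accumulation_steps = 8
world_size = 2                     # number of processes/GPUs

# New formula: one pass over the dataset counts every sampled completion as an
# episode, so 1 epoch over 1,000 prompts with rloo_k = 2 is 2,000 episodes.
total_episodes = int(num_train_epochs * train_dataset_len * rloo_k)            # 2000

# New formula: the local batch size no longer multiplies in num_mini_batches.
local_batch_size = per_device_train_batch_size * gradient_accumulation_steps   # 32
batch_size = local_batch_size * world_size                                     # 64
print(total_episodes, local_batch_size, batch_size)
```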

@@ -275,8 +275,8 @@ def repeat_generator():
         # trainer state initialization
         self.state.global_step = 0
         self.state.episode = 0
-        self.state.max_steps = (args.num_total_batches * args.num_mini_batches) // 2
-        self.state.num_train_epochs = args.total_episodes / self.train_dataset_len
+        self.state.max_steps = args.num_total_batches
+        self.state.num_train_epochs = (args.total_episodes / args.rloo_k) / self.train_dataset_len
         # Compute absolute values for logging, eval, and save if given as ratio
         if args.logging_steps is not None:
             if args.logging_steps < 1:
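
With this change, `max_steps` is simply the number of batches the trainer will process, and `num_train_epochs` is recovered by undoing the `rloo_k` factor that the first hunk introduced into `total_episodes`. Continuing the made-up numbers from the sketch above (the `num_total_batches` value is hypothetical, standing in for whatever the trainer derives from `total_episodes` and `batch_size`):

```python
# Same illustrative numbers as before.
total_episodes = 2_000
rloo_k = 2
train_dataset_len = 1_000
num_total_batches = 32             # hypothetical, roughly total_episodes / batch_size = 2000 / 64

max_steps = num_total_batches                                       # 32: one trainer step per batch
num_train_epochs = (total_episodes / rloo_k) / train_dataset_len    # 1.0: back to the single epoch requested
print(max_steps, num_train_epochs)
```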