From ed4031e2b9da445c634f10bdaa79f8f5c95ffcd2 Mon Sep 17 00:00:00 2001
From: riffmaster <173845832+riffmaster-2001@users.noreply.github.com>
Date: Mon, 26 Aug 2024 21:54:01 -0700
Subject: [PATCH 1/3] fixing typo in flux document for
 preserve_data_backend_cache key

---
 documentation/DATALOADER.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/documentation/DATALOADER.md b/documentation/DATALOADER.md
index dcb9740a..2c5ddc89 100644
--- a/documentation/DATALOADER.md
+++ b/documentation/DATALOADER.md
@@ -203,7 +203,7 @@ Images are not resized before cropping **unless** `maximum_image_size` and `targ
 - This is equivalent to the commandline option `--skip_file_discovery`
 - This is helpful if you have datasets you don't need the trainer to scan on every startup, eg. their latents/embeds are already cached fully. This allows quicker startup and resumption of training.
 
-### `preserve_data_cache_backend`
+### `preserve_data_backend_cache`
 
 - You probably don't want to ever set this - it is useful only for very large AWS datasets.
 - Like `skip_file_discovery`, this option can be set to prevent unnecessary, lengthy and costly filesystem scans at startup.

From a62afcabc58d57dcf53cd00aec5cbf06938b504c Mon Sep 17 00:00:00 2001
From: bghira <bghira@users.github.com>
Date: Tue, 27 Aug 2024 08:26:44 -0600
Subject: [PATCH 2/3] cosineannealinghardrestarts needs last_step set for newer
 accelerate preparation

---
 helpers/training/custom_schedule.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/helpers/training/custom_schedule.py b/helpers/training/custom_schedule.py
index 63ebb4d4..a2d6fe3b 100644
--- a/helpers/training/custom_schedule.py
+++ b/helpers/training/custom_schedule.py
@@ -369,6 +369,7 @@ def __init__(
         self.T_mult = T_mult
         self.eta_min = eta_min
         self.T_cur = last_step
+        self.last_step = last_step
         super().__init__(optimizer, last_step, verbose)
 
     def get_lr(self):

From cebac83bfb7202e9804a1d712cac3bb6d363855a Mon Sep 17 00:00:00 2001
From: bghira <bghira@users.github.com>
Date: Tue, 27 Aug 2024 11:50:59 -0600
Subject: [PATCH 3/3] reintroduce timestep dependent shift as an option during
 flux training for dev and schnell, disabled by default

---
 helpers/arguments.py | 13 +++++++++++++
 train.py             |  7 +++++++
 2 files changed, 20 insertions(+)

diff --git a/helpers/arguments.py b/helpers/arguments.py
index 7a2697f8..b2182c13 100644
--- a/helpers/arguments.py
+++ b/helpers/arguments.py
@@ -140,6 +140,19 @@ def parse_args(input_args=None):
             " which has improved results in short experiments. Thanks to @mhirki for the contribution."
         ),
     )
+    parser.add_argument(
+        "--flux_schedule_shift",
+        type=float,
+        default=None,
+        help=(
+            "Shift the noise schedule. This is a value between 0 and ~4.0, where 0 disables the timestep-dependent shift,"
+            " and anything greater than 0 will shift the timestep sampling accordingly. The SD3 model was trained with"
+            " a shift value of 3. The value for Flux is unknown. Higher values result in less noisy timesteps sampled,"
+            " which results in a lower mean loss value, but not necessarily better results. Early reports indicate"
+            " that modification of this value can change how the contrast is learnt by the model, and whether fine"
+            " details are ignored or accentuated."
+        ),
+    )
     parser.add_argument(
         "--flux_guidance_mode",
         type=str,
diff --git a/train.py b/train.py
index ff58ac0c..07dc478c 100644
--- a/train.py
+++ b/train.py
@@ -1670,6 +1670,13 @@ def main():
                             device=accelerator.device,
                         )
                     timesteps = sigmas * 1000.0
+                    # The shift s*x / (1 + (s-1)*x) maps [0, 1] onto [0, 1], so it has
+                    # to act on sigmas rather than on the 0..1000 timesteps; timesteps
+                    # are then re-derived so both stay consistent with each other.
+                    flux_shift = args.flux_schedule_shift
+                    if flux_shift is not None and flux_shift > 0:
+                        sigmas = (sigmas * flux_shift) / (1 + (flux_shift - 1) * sigmas)
+                        timesteps = sigmas * 1000.0
                     sigmas = sigmas.view(-1, 1, 1, 1)
                 else:
                     # Sample a random timestep for each image, potentially biased by the timestep weights.