From dfe08f395fb7860e5b888f37231ad8564852b95a Mon Sep 17 00:00:00 2001 From: BootsofLagrangian <125134079+BootsofLagrangian@users.noreply.github.com> Date: Sun, 4 Feb 2024 03:12:42 +0900 Subject: [PATCH 001/132] support deepspeed --- fine_tune.py | 41 ++++++++++++++++++++------- library/train_util.py | 54 +++++++++++++++++++++++++++++++++++ sdxl_train.py | 45 ++++++++++++++++++++--------- train_db.py | 37 ++++++++++++++++++------ train_network.py | 66 +++++++++++++++++++++++++++++++------------ 5 files changed, 194 insertions(+), 49 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 982dc8aec..78dfd1696 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -102,6 +102,7 @@ def train(args): # mixed precisionに対応した型を用意しておき適宜castする weight_dtype, save_dtype = train_util.prepare_dtype(args) + vae_dtype = torch.float32 if args.no_half_vae else weight_dtype # モデルを読み込む text_encoder, vae, unet, load_stable_diffusion_format = train_util.load_target_model(args, weight_dtype, accelerator) @@ -152,7 +153,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # 学習を準備する if cache_latents: - vae.to(accelerator.device, dtype=weight_dtype) + vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) vae.eval() with torch.no_grad(): @@ -187,7 +188,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if not cache_latents: vae.requires_grad_(False) vae.eval() - vae.to(accelerator.device, dtype=weight_dtype) + vae.to(accelerator.device, dtype=vae_dtype) for m in training_models: m.requires_grad_(True) @@ -214,7 +215,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers, + num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. persistent_workers=args.persistent_data_loader_workers, ) @@ -240,13 +241,33 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): unet.to(weight_dtype) text_encoder.to(weight_dtype) - # acceleratorがなんかよろしくやってくれるらしい - if args.train_text_encoder: - unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - unet, text_encoder, optimizer, train_dataloader, lr_scheduler - ) - else: - unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) + if args.deepspeed: + # wrapping model + class DeepSpeedModel(torch.nn.Module): + def __init__(self, unet, text_encoder, vae) -> None: + super().__init__() + self.unet = unet + self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) + self.vae = vae + def get_models(self): + return self.unet, self.text_encoders, self.vae + + unet.to(accelerator.device, dtype=weight_dtype) + [t_enc.to(accelerator.device, dtype=weight_dtype) for t_enc in text_encoders] + ds_model = DeepSpeedModel(unet, text_encoders, vae) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) + # Now, ds_model is an instance of DeepSpeedEngine. 
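        # What prepare() hands back here, sketched under the assumption of a
        # recent deepspeed/accelerate (attribute lookups on the engine fall
        # through to the wrapped module, which is why get_models() below is
        # still reachable):
        #   ds_model     -> deepspeed.DeepSpeedEngine wrapping DeepSpeedModel
        #   optimizer    -> accelerate.utils.DeepSpeedOptimizerWrapper
        #   lr_scheduler -> accelerate.utils.DeepSpeedSchedulerWrapper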
+ unet, text_encoders, vae = ds_model.get_models() # for compatiblility + vae.to(vae_dtype) + text_encoder = text_encoders + + else: # acceleratorがなんかよろしくやってくれるらしい + if args.train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする if args.full_fp16: diff --git a/library/train_util.py b/library/train_util.py index ba428e508..2d85c9776 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -20,6 +20,7 @@ Union, ) from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs +from accelerate import DeepSpeedPlugin import gc import glob import math @@ -3124,6 +3125,47 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: "--prior_loss_weight", type=float, default=1.0, help="loss weight for regularization images / 正則化画像のlossの重み" ) + # DeepSpeed Arguments. https://huggingface.co/docs/accelerate/usage_guides/deepspeed + parser.add_argument("--deepspeed", action="store_true", help="enable deepspeed training") + parser.add_argument( + "--zero_stage", + type=int, default=2, + choices=[0, 1, 2, 3], + help="Possible options are 0,1,2,3." + ) + parser.add_argument( + "--offload_optimizer", + type=str, default=None, + choices=[None, "cpu", "nvme"], + help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3." + ) + parser.add_argument( + "--offload_optimizer_nvme_path", + type=str, default=None, + help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3." + ) + parser.add_argument( + "--offload_param_device", + type=str, default=None, + choices=[None, "cpu", "nvme"], + help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3." + ) + parser.add_argument( + "--offload_param_nvme_path", + type=str, default=None, + help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3." + ) + parser.add_argument( + "--zero3_init_flag", + action="store_true", + help="Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models." + "Only applicable with ZeRO Stage-3." + ) + parser.add_argument( + "--zero3_save_16bit_model", + action="store_true", + help="Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3." 
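        # (accelerate maps this flag to DeepSpeed's
        # "stage3_gather_16bit_weights_on_model_save", so ZeRO-3 parameter
        # shards are gathered into a full fp16 state_dict at save time.)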
+ ) def verify_training_args(args: argparse.Namespace): if args.v_parameterization and not args.v2: @@ -3912,6 +3954,17 @@ def prepare_accelerator(args: argparse.Namespace): else None, ) kwargs_handlers = list(filter(lambda x: x is not None, kwargs_handlers)) + deepspeed_plugin = None + if args.deepspeed: + deepspeed_plugin = DeepSpeedPlugin( + zero_stage=args.zero_stage, + gradient_accumulation_steps=args.gradient_accumulation_steps, gradient_clipping=args.max_grad_norm, + offload_optimizer=args.offload_optimizer, offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, + offload_param_device=args.offload_param_device, offload_param_nvme_path=args.offload_param_nvme_path, + zero3_init_flag=args.zero3_init_flag, zero3_save_16bit_model=args.zero3_save_16bit_model, + ) + deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, @@ -3919,6 +3972,7 @@ def prepare_accelerator(args: argparse.Namespace): project_dir=logging_dir, kwargs_handlers=kwargs_handlers, dynamo_backend=dynamo_backend, + deepspeed_plugin=deepspeed_plugin, ) return accelerator diff --git a/sdxl_train.py b/sdxl_train.py index a3f6f3a17..6ce6c201e 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -354,7 +354,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers, + num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. persistent_workers=args.persistent_data_loader_workers, ) @@ -389,18 +389,37 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder1.to(weight_dtype) text_encoder2.to(weight_dtype) - # acceleratorがなんかよろしくやってくれるらしい - if train_unet: - unet = accelerator.prepare(unet) - if train_text_encoder1: - # freeze last layer and final_layer_norm in te1 since we use the output of the penultimate layer - text_encoder1.text_model.encoder.layers[-1].requires_grad_(False) - text_encoder1.text_model.final_layer_norm.requires_grad_(False) - text_encoder1 = accelerator.prepare(text_encoder1) - if train_text_encoder2: - text_encoder2 = accelerator.prepare(text_encoder2) - - optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + if args.deepspeed: + # Wrapping model for DeepSpeed + class DeepSpeedModel(torch.nn.Module): + def __init__(self, unet, text_encoder, vae) -> None: + super().__init__() + self.unet = unet + self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) + self.vae = vae + + def get_models(self): + return self.unet, self.text_encoders, self.vae + text_encoders = [text_encoder1, text_encoder2] + unet.to(accelerator.device, dtype=weight_dtype) + [t_enc.to(accelerator.device, dtype=weight_dtype) for t_enc in text_encoders] + ds_model = DeepSpeedModel(unet, text_encoders, vae) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) + # Now, ds_model is an instance of DeepSpeedEngine. 
+ unet, text_encoders, vae = ds_model.get_models() # for compatiblility + vae.to(vae_dtype) # to avoid explicitly half-vae + text_encoder1, text_encoder2 = text_encoders[0], text_encoders[1] + else: # acceleratorがなんかよろしくやってくれるらしい + if train_unet: + unet = accelerator.prepare(unet) + if train_text_encoder1: + # freeze last layer and final_layer_norm in te1 since we use the output of the penultimate layer + text_encoder1.text_model.encoder.layers[-1].requires_grad_(False) + text_encoder1.text_model.final_layer_norm.requires_grad_(False) + text_encoder1 = accelerator.prepare(text_encoder1) + if train_text_encoder2: + text_encoder2 = accelerator.prepare(text_encoder2) + optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) # TextEncoderの出力をキャッシュするときにはCPUへ移動する if args.cache_text_encoder_outputs: diff --git a/train_db.py b/train_db.py index 888cad25e..d5f47a179 100644 --- a/train_db.py +++ b/train_db.py @@ -184,7 +184,7 @@ def train(args): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers, + num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. persistent_workers=args.persistent_data_loader_workers, ) @@ -214,15 +214,36 @@ def train(args): text_encoder.to(weight_dtype) # acceleratorがなんかよろしくやってくれるらしい - if train_text_encoder: - unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - unet, text_encoder, optimizer, train_dataloader, lr_scheduler - ) + if args.deepspeed: + # wrapping model + class DeepSpeedModel(torch.nn.Module): + def __init__(self, unet, text_encoder, vae) -> None: + super().__init__() + self.unet = unet + self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) + self.vae = vae + + def get_models(self): + return self.unet, self.text_encoders, self.vae + + unet.to(accelerator.device, dtype=weight_dtype) + [t_enc.to(accelerator.device, dtype=weight_dtype) for t_enc in text_encoders] + ds_model = DeepSpeedModel(unet, text_encoders, vae) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) + # Now, ds_model is an instance of DeepSpeedEngine. + unet, text_encoders, vae = ds_model.get_models() # for compatiblility + vae.to(vae_dtype) # to avoid explicitly half-vae + text_encoder = text_encoders else: - unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) + if train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) - if not train_text_encoder: - text_encoder.to(accelerator.device, dtype=weight_dtype) # to avoid 'cpu' vs 'cuda' error + if not train_text_encoder: + text_encoder.to(accelerator.device, dtype=weight_dtype) # to avoid 'cpu' vs 'cuda' error # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする if args.full_fp16: diff --git a/train_network.py b/train_network.py index 8d102ae8f..05dbe2de7 100644 --- a/train_network.py +++ b/train_network.py @@ -353,18 +353,26 @@ def train(self, args): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers, + num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. 
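            # (DeepSpeed launches one process per GPU; forked DataLoader
            # workers under it have been seen to abort with exit code 1,
            # hence the single worker whenever --deepspeed is set.)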
persistent_workers=args.persistent_data_loader_workers, ) # 学習ステップ数を計算する if args.max_train_epochs is not None: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print( - f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" - ) + if args.deepspeed: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps + ) + accelerator.print( + f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) + else: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print( + f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) @@ -409,20 +417,42 @@ def train(self, args): t_enc.text_model.embeddings.to(dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) # acceleratorがなんかよろしくやってくれるらしい / accelerator will do something good - if train_unet: - unet = accelerator.prepare(unet) + if args.deepspeed: + # wrapping model + class DeepSpeedModel(torch.nn.Module): + def __init__(self, unet, text_encoder, vae, network) -> None: + super().__init__() + self.unet = unet + self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) + self.vae = vae + self.network = network + + def get_models(self): + return self.unet, self.text_encoders, self.vae, self.network + + unet.to(accelerator.device, dtype=unet_weight_dtype) + [t_enc.to(accelerator.device, dtype=te_weight_dtype) for t_enc in text_encoders] + ds_model = DeepSpeedModel(unet, text_encoders, vae, network) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) + # Now, ds_model is an instance of DeepSpeedEngine. + unet, text_encoders, vae, network = ds_model.get_models() # for compatiblility + vae.to(vae_dtype) # to avoid explicitly half-vae + text_encoder = text_encoders else: - unet.to(accelerator.device, dtype=unet_weight_dtype) # move to device because unet is not prepared by accelerator - if train_text_encoder: - if len(text_encoders) > 1: - text_encoder = text_encoders = [accelerator.prepare(t_enc) for t_enc in text_encoders] + if train_unet: + unet = accelerator.prepare(unet) else: - text_encoder = accelerator.prepare(text_encoder) - text_encoders = [text_encoder] - else: - pass # if text_encoder is not trained, no need to prepare. and device and dtype are already set + unet.to(accelerator.device, dtype=unet_weight_dtype) # move to device because unet is not prepared by accelerator + if train_text_encoder: + if len(text_encoders) > 1: + text_encoder = text_encoders = [accelerator.prepare(t_enc) for t_enc in text_encoders] + else: + text_encoder = accelerator.prepare(text_encoder) + text_encoders = [text_encoder] + else: + pass # if text_encoder is not trained, no need to prepare. 
and device and dtype are already set - network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(network, optimizer, train_dataloader, lr_scheduler) + network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(network, optimizer, train_dataloader, lr_scheduler) if args.gradient_checkpointing: # according to TI example in Diffusers, train is required From 64873c1b4317afad99a1d397454ba0c64c6cb0b1 Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Mon, 5 Feb 2024 17:11:50 +0900 Subject: [PATCH 002/132] fix offload_optimizer_device typo --- library/train_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 2d85c9776..933a34c48 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3134,7 +3134,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: help="Possible options are 0,1,2,3." ) parser.add_argument( - "--offload_optimizer", + "--offload_optimizer_device", type=str, default=None, choices=[None, "cpu", "nvme"], help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3." @@ -3959,7 +3959,7 @@ def prepare_accelerator(args: argparse.Namespace): deepspeed_plugin = DeepSpeedPlugin( zero_stage=args.zero_stage, gradient_accumulation_steps=args.gradient_accumulation_steps, gradient_clipping=args.max_grad_norm, - offload_optimizer=args.offload_optimizer, offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, + offload_optimizer_device=args.offload_optimizer_device, offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, offload_param_device=args.offload_param_device, offload_param_nvme_path=args.offload_param_nvme_path, zero3_init_flag=args.zero3_init_flag, zero3_save_16bit_model=args.zero3_save_16bit_model, ) From 2824312d5eb6df118d7585cde7e84d4cdae6f6c6 Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Mon, 5 Feb 2024 20:13:28 +0900 Subject: [PATCH 003/132] fix vae type error during training sdxl --- library/sdxl_train_util.py | 1 - library/train_util.py | 5 ----- sdxl_train.py | 25 +++++++++++-------------- 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/library/sdxl_train_util.py b/library/sdxl_train_util.py index 5ad748d15..ff7fef176 100644 --- a/library/sdxl_train_util.py +++ b/library/sdxl_train_util.py @@ -17,7 +17,6 @@ def load_target_model(args, accelerator, model_version: str, weight_dtype): - # load models for each process model_dtype = match_mixed_precision(args, weight_dtype) # prepare fp16/bf16 for pi in range(accelerator.state.num_processes): if pi == accelerator.state.local_process_index: diff --git a/library/train_util.py b/library/train_util.py index 933a34c48..a20edbe15 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -4042,28 +4042,23 @@ def _load_target_model(args: argparse.Namespace, weight_dtype, device="cpu", une def load_target_model(args, weight_dtype, accelerator, unet_use_linear_projection_in_v2=False): - # load models for each process for pi in range(accelerator.state.num_processes): if pi == accelerator.state.local_process_index: print(f"loading model for process {accelerator.state.local_process_index}/{accelerator.state.num_processes}") - text_encoder, vae, unet, load_stable_diffusion_format = _load_target_model( args, weight_dtype, accelerator.device if args.lowram else "cpu", unet_use_linear_projection_in_v2=unet_use_linear_projection_in_v2, ) - # work on low-ram device if args.lowram: text_encoder.to(accelerator.device) 
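        # (the per-rank loop above is meant to stagger checkpoint loading so
        # that ranks do not all read the weights at once; with --lowram the
        # tensors are placed directly on the GPU rather than staged on CPU.)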
unet.to(accelerator.device) vae.to(accelerator.device) - gc.collect() torch.cuda.empty_cache() accelerator.wait_for_everyone() - return text_encoder, vae, unet, load_stable_diffusion_format diff --git a/sdxl_train.py b/sdxl_train.py index 6ce6c201e..e8680828b 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -392,23 +392,20 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if args.deepspeed: # Wrapping model for DeepSpeed class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder, vae) -> None: + def __init__(self, unet, text_encoder) -> None: super().__init__() self.unet = unet self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - self.vae = vae def get_models(self): - return self.unet, self.text_encoders, self.vae + return self.unet, self.text_encoders text_encoders = [text_encoder1, text_encoder2] - unet.to(accelerator.device, dtype=weight_dtype) - [t_enc.to(accelerator.device, dtype=weight_dtype) for t_enc in text_encoders] - ds_model = DeepSpeedModel(unet, text_encoders, vae) + ds_model = DeepSpeedModel(unet, text_encoders) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) # Now, ds_model is an instance of DeepSpeedEngine. - unet, text_encoders, vae = ds_model.get_models() # for compatiblility - vae.to(vae_dtype) # to avoid explicitly half-vae - text_encoder1, text_encoder2 = text_encoders[0], text_encoders[1] + unet, text_encoders = ds_model.get_models() # for compatiblility + text_encoder1, text_encoder2 = text_encoder = text_encoders + training_models = [unet, text_encoder1, text_encoder2] else: # acceleratorがなんかよろしくやってくれるらしい if train_unet: unet = accelerator.prepare(unet) @@ -493,10 +490,10 @@ def get_models(self): for step, batch in enumerate(train_dataloader): current_step.value = global_step with accelerator.accumulate(*training_models): - if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) - else: - with torch.no_grad(): + with torch.no_grad(): # why this block differ within train_network.py? 
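                # (both branches below are inference-only: cached latents are
                # merely moved and cast, and the VAE encode runs on a frozen
                # model, so neither path needs an autograd graph.)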
+ if "latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) + else: # latentに変換 latents = vae.encode(batch["images"].to(vae_dtype)).latent_dist.sample().to(weight_dtype) @@ -504,7 +501,7 @@ def get_models(self): if torch.any(torch.isnan(latents)): accelerator.print("NaN found in latents, replacing with zeros") latents = torch.nan_to_num(latents, 0, out=latents) - latents = latents * sdxl_model_util.VAE_SCALE_FACTOR + latents = latents * sdxl_model_util.VAE_SCALE_FACTOR if "text_encoder_outputs1_list" not in batch or batch["text_encoder_outputs1_list"] is None: input_ids1 = batch["input_ids"] From 4295f91dcd75a7405aa70d5c5d2c826a618a4bcc Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Mon, 5 Feb 2024 20:19:56 +0900 Subject: [PATCH 004/132] fix all trainer about vae --- fine_tune.py | 29 ++++++++++++++++------------- train_db.py | 29 ++++++++++++++++------------- train_network.py | 15 +++++---------- 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 78dfd1696..f901ee641 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -221,10 +221,18 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # 学習ステップ数を計算する if args.max_train_epochs is not None: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") + if args.deepspeed: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps + ) + accelerator.print( + f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) + else: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) @@ -244,21 +252,16 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if args.deepspeed: # wrapping model class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder, vae) -> None: + def __init__(self, unet, text_encoder) -> None: super().__init__() self.unet = unet self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - self.vae = vae def get_models(self): - return self.unet, self.text_encoders, self.vae - - unet.to(accelerator.device, dtype=weight_dtype) - [t_enc.to(accelerator.device, dtype=weight_dtype) for t_enc in text_encoders] - ds_model = DeepSpeedModel(unet, text_encoders, vae) + return self.unet, self.text_encoders + ds_model = DeepSpeedModel(unet, text_encoders) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) # Now, ds_model is an instance of DeepSpeedEngine. 
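        # This commit drops the frozen VAE from the wrapper: only modules that
        # receive gradients belong inside the engine. The VAE is handled
        # manually instead, along the lines of:
        #     vae.requires_grad_(False)
        #     vae.eval()
        #     vae.to(accelerator.device, dtype=vae_dtype)  # fp32 under --no_half_vae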
- unet, text_encoders, vae = ds_model.get_models() # for compatiblility - vae.to(vae_dtype) + unet, text_encoders = ds_model.get_models() # for compatiblility text_encoder = text_encoders else: # acceleratorがなんかよろしくやってくれるらしい diff --git a/train_db.py b/train_db.py index d5f47a179..fa7f6a8dc 100644 --- a/train_db.py +++ b/train_db.py @@ -190,10 +190,18 @@ def train(args): # 学習ステップ数を計算する if args.max_train_epochs is not None: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") + if args.deepspeed: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps + ) + accelerator.print( + f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) + else: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) @@ -217,22 +225,17 @@ def train(args): if args.deepspeed: # wrapping model class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder, vae) -> None: + def __init__(self, unet, text_encoder) -> None: super().__init__() self.unet = unet self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - self.vae = vae def get_models(self): - return self.unet, self.text_encoders, self.vae - - unet.to(accelerator.device, dtype=weight_dtype) - [t_enc.to(accelerator.device, dtype=weight_dtype) for t_enc in text_encoders] - ds_model = DeepSpeedModel(unet, text_encoders, vae) + return self.unet, self.text_encoders + ds_model = DeepSpeedModel(unet, text_encoders) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) # Now, ds_model is an instance of DeepSpeedEngine. - unet, text_encoders, vae = ds_model.get_models() # for compatiblility - vae.to(vae_dtype) # to avoid explicitly half-vae + unet, text_encoders = ds_model.get_models() # for compatiblility text_encoder = text_encoders else: if train_text_encoder: diff --git a/train_network.py b/train_network.py index 05dbe2de7..bbda427aa 100644 --- a/train_network.py +++ b/train_network.py @@ -364,7 +364,7 @@ def train(self, args): len(train_dataloader) / args.gradient_accumulation_steps ) accelerator.print( - f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. 
steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" ) else: args.max_train_steps = args.max_train_epochs * math.ceil( @@ -420,23 +420,18 @@ def train(self, args): if args.deepspeed: # wrapping model class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder, vae, network) -> None: + def __init__(self, unet, text_encoder, network) -> None: super().__init__() self.unet = unet self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - self.vae = vae self.network = network def get_models(self): - return self.unet, self.text_encoders, self.vae, self.network - - unet.to(accelerator.device, dtype=unet_weight_dtype) - [t_enc.to(accelerator.device, dtype=te_weight_dtype) for t_enc in text_encoders] - ds_model = DeepSpeedModel(unet, text_encoders, vae, network) + return self.unet, self.text_encoders, self.network + ds_model = DeepSpeedModel(unet, text_encoders, network) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) # Now, ds_model is an instance of DeepSpeedEngine. - unet, text_encoders, vae, network = ds_model.get_models() # for compatiblility - vae.to(vae_dtype) # to avoid explicitly half-vae + unet, text_encoders, network = ds_model.get_models() # for compatiblility text_encoder = text_encoders else: if train_unet: From 3970bf40804d9c66e76e0af5e1d0477f19bfa79a Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Mon, 5 Feb 2024 22:40:43 +0900 Subject: [PATCH 005/132] maybe fix branch to run offloading --- library/train_util.py | 2 ++ sdxl_train.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/library/train_util.py b/library/train_util.py index a20edbe15..676652e90 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3964,6 +3964,8 @@ def prepare_accelerator(args: argparse.Namespace): zero3_init_flag=args.zero3_init_flag, zero3_save_16bit_model=args.zero3_save_16bit_model, ) deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size + deepspeed_plugin.deepspeed_config['train_batch_size'] = \ + args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE']) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, diff --git a/sdxl_train.py b/sdxl_train.py index e8680828b..ef3ead380 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -391,6 +391,12 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if args.deepspeed: # Wrapping model for DeepSpeed + import deepspeed + if args.offload_optimizer_device is not None: + accelerator.print('[DeepSpeed] start to manually build cpu_adam.') + deepspeed.ops.op_builder.CPUAdamBuilder().load() + accelerator.print('[DeepSpeed] building cpu_adam done.') + class DeepSpeedModel(torch.nn.Module): def __init__(self, unet, text_encoder) -> None: super().__init__() From 7d2a9268b9d8d3c9b78068aaa2f9d43eb8b6101b Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Mon, 5 Feb 2024 22:42:06 +0900 Subject: [PATCH 006/132] apply offloading method runable for all trainer --- fine_tune.py | 5 +++++ train_db.py | 5 +++++ train_network.py | 5 +++++ 3 files changed, 15 insertions(+) diff --git a/fine_tune.py b/fine_tune.py index f901ee641..85febeaaa 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -251,6 +251,11 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if args.deepspeed: # wrapping model + import deepspeed + if args.offload_optimizer_device is not None: + accelerator.print('[DeepSpeed] start 
to manually build cpu_adam.') + deepspeed.ops.op_builder.CPUAdamBuilder().load() + accelerator.print('[DeepSpeed] building cpu_adam done.') class DeepSpeedModel(torch.nn.Module): def __init__(self, unet, text_encoder) -> None: super().__init__() diff --git a/train_db.py b/train_db.py index fa7f6a8dc..e26618867 100644 --- a/train_db.py +++ b/train_db.py @@ -224,6 +224,11 @@ def train(args): # acceleratorがなんかよろしくやってくれるらしい if args.deepspeed: # wrapping model + import deepspeed + if args.offload_optimizer_device is not None: + accelerator.print('[DeepSpeed] start to manually build cpu_adam.') + deepspeed.ops.op_builder.CPUAdamBuilder().load() + accelerator.print('[DeepSpeed] building cpu_adam done.') class DeepSpeedModel(torch.nn.Module): def __init__(self, unet, text_encoder) -> None: super().__init__() diff --git a/train_network.py b/train_network.py index bbda427aa..050a65111 100644 --- a/train_network.py +++ b/train_network.py @@ -419,6 +419,11 @@ def train(self, args): # acceleratorがなんかよろしくやってくれるらしい / accelerator will do something good if args.deepspeed: # wrapping model + import deepspeed + if args.offload_optimizer_device is not None: + accelerator.print('[DeepSpeed] start to manually build cpu_adam.') + deepspeed.ops.op_builder.CPUAdamBuilder().load() + accelerator.print('[DeepSpeed] building cpu_adam done.') class DeepSpeedModel(torch.nn.Module): def __init__(self, unet, text_encoder, network) -> None: super().__init__() From 62556619bdc876c450bfb1445b16683cf3a98699 Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Wed, 7 Feb 2024 16:42:05 +0900 Subject: [PATCH 007/132] fix full_fp16 compatible and train_step --- fine_tune.py | 16 ++----- library/train_util.py | 7 +++ sdxl_train.py | 3 +- test_pip_requirements.txt | 96 +++++++++++++++++++++++++++++++++++++++ train_db.py | 16 ++----- train_network.py | 23 ++++------ 6 files changed, 121 insertions(+), 40 deletions(-) create mode 100644 test_pip_requirements.txt diff --git a/fine_tune.py b/fine_tune.py index 85febeaaa..eb652742c 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -221,18 +221,10 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # 学習ステップ数を計算する if args.max_train_epochs is not None: - if args.deepspeed: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps - ) - accelerator.print( - f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" - ) - else: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) diff --git a/library/train_util.py b/library/train_util.py index 676652e90..ea6265109 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3166,6 +3166,11 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: action="store_true", help="Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3." 
) + parser.add_argument( + "--fp16_master_weights_and_gradients", + action="store_true", + help="fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32." + ) def verify_training_args(args: argparse.Namespace): if args.v_parameterization and not args.v2: @@ -3966,6 +3971,8 @@ def prepare_accelerator(args: argparse.Namespace): deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size deepspeed_plugin.deepspeed_config['train_batch_size'] = \ args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE']) + if args.full_fp16 or args.fp16_master_weights_and_gradients: + deepspeed_plugin.deepspeed_config['fp16_master_weights_and_gradients'] = True accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, diff --git a/sdxl_train.py b/sdxl_train.py index ef3ead380..54902b873 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -437,7 +437,8 @@ def get_models(self): text_encoder2.to(accelerator.device) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする - if args.full_fp16: + if args.full_fp16 and not args.deepspeed: + # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. train_util.patch_accelerator_for_fp16_training(accelerator) # resumeする diff --git a/test_pip_requirements.txt b/test_pip_requirements.txt new file mode 100644 index 000000000..6abec3516 --- /dev/null +++ b/test_pip_requirements.txt @@ -0,0 +1,96 @@ +absl-py==2.1.0 +accelerate==0.25.0 +aiohttp==3.9.3 +aiosignal==1.3.1 +altair==4.2.2 +annotated-types @ file:///home/conda/feedstock_root/build_artifacts/annotated-types_1696634205638/work +async-timeout==4.0.3 +attrs==23.2.0 +bitsandbytes==0.42.0 +Brotli @ file:///home/conda/feedstock_root/build_artifacts/brotli-split_1695989787169/work +cachetools==5.3.2 +certifi==2022.12.7 +charset-normalizer==2.1.1 +cmake==3.25.0 +deepspeed==0.13.1 +diffusers==0.25.0 +easygui==0.98.3 +einops==0.6.1 +entrypoints==0.4 +filelock==3.9.0 +frozenlist==1.4.1 +fsspec==2024.2.0 +ftfy==6.1.1 +gmpy2 @ file:///home/conda/feedstock_root/build_artifacts/gmpy2_1666808654411/work +google-auth==2.27.0 +google-auth-oauthlib==0.4.6 +grpcio==1.60.1 +hjson==3.1.0 +huggingface-hub==0.20.1 +idna==3.4 +importlib-metadata==7.0.1 +Jinja2==3.1.2 +jsonschema==4.21.1 +jsonschema-specifications==2023.12.1 +-e git+https://github.com/kohya-ss/sd-scripts@cd19df49cd512e13ac90db115c424d19c0e8868a#egg=library +lightning-utilities==0.10.1 +lit==15.0.7 +Markdown==3.5.2 +MarkupSafe==2.1.3 +mpmath==1.3.0 +multidict==6.0.5 +networkx==3.2.1 +ninja==1.11.1.1 +numpy==1.26.3 +oauthlib==3.2.2 +open-clip-torch==2.20.0 +opencv-python==4.7.0.68 +packaging==23.2 +pandas==2.2.0 +pillow==10.2.0 +protobuf==3.19.6 +psutil==5.9.8 +py-cpuinfo @ file:///home/conda/feedstock_root/build_artifacts/py-cpuinfo_1666774466606/work +pyasn1==0.5.1 +pyasn1-modules==0.3.0 +pydantic @ file:///home/conda/feedstock_root/build_artifacts/pydantic_1706543943340/work +pydantic_core @ file:///home/conda/feedstock_root/build_artifacts/pydantic-core_1705674688239/work +pynvml==11.5.0 +PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1661604839144/work +python-dateutil==2.8.2 +pytorch-lightning==1.9.0 +pytz==2024.1 +PyYAML @ file:///home/conda/feedstock_root/build_artifacts/pyyaml_1695373428874/work +referencing==0.33.0 +regex==2023.12.25 +requests==2.28.1 +requests-oauthlib==1.3.1 +rpds-py==0.17.1 +rsa==4.9 
+safetensors==0.4.2 +scipy==1.12.0 +sentencepiece==0.1.99 +six==1.16.0 +sympy==1.12 +tensorboard==2.10.1 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.1 +timm==0.9.12 +tokenizers==0.15.1 +toml==0.10.2 +toolz==0.12.1 +torch==2.0.1+cu118 +torchaudio==2.2.0 +torchmetrics==1.3.0.post0 +torchvision==0.15.2+cu118 +tqdm==4.66.1 +transformers==4.36.2 +triton==2.0.0 +typing_extensions==4.8.0 +tzdata==2023.4 +urllib3==1.26.13 +voluptuous==0.13.1 +wcwidth==0.2.13 +Werkzeug==3.0.1 +yarl==1.9.4 +zipp==3.17.0 diff --git a/train_db.py b/train_db.py index e26618867..58536555e 100644 --- a/train_db.py +++ b/train_db.py @@ -190,18 +190,10 @@ def train(args): # 学習ステップ数を計算する if args.max_train_epochs is not None: - if args.deepspeed: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps - ) - accelerator.print( - f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" - ) - else: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) diff --git a/train_network.py b/train_network.py index 050a65111..cc445d39a 100644 --- a/train_network.py +++ b/train_network.py @@ -359,20 +359,12 @@ def train(self, args): # 学習ステップ数を計算する if args.max_train_epochs is not None: - if args.deepspeed: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps - ) - accelerator.print( - f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" - ) - else: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print( - f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" - ) + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print( + f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) @@ -479,7 +471,8 @@ def get_models(self): vae.to(accelerator.device, dtype=vae_dtype) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする - if args.full_fp16: + if args.full_fp16 and not args.deepspeed: + # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. 
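            # (under DeepSpeed the engine owns loss scaling: accelerate routes
            # accelerator.backward() to engine.backward(loss), and the engine's
            # step() unscales and clips internally, so the GradScaler patch
            # below is applied only in the non-DeepSpeed path.)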
train_util.patch_accelerator_for_fp16_training(accelerator) # resumeする From 2445a5b74e4c5bb0af24e0b3162c1eaef218b56b Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Wed, 7 Feb 2024 16:48:18 +0900 Subject: [PATCH 008/132] remove test requirements --- test_pip_requirements.txt | 96 --------------------------------------- 1 file changed, 96 deletions(-) delete mode 100644 test_pip_requirements.txt diff --git a/test_pip_requirements.txt b/test_pip_requirements.txt deleted file mode 100644 index 6abec3516..000000000 --- a/test_pip_requirements.txt +++ /dev/null @@ -1,96 +0,0 @@ -absl-py==2.1.0 -accelerate==0.25.0 -aiohttp==3.9.3 -aiosignal==1.3.1 -altair==4.2.2 -annotated-types @ file:///home/conda/feedstock_root/build_artifacts/annotated-types_1696634205638/work -async-timeout==4.0.3 -attrs==23.2.0 -bitsandbytes==0.42.0 -Brotli @ file:///home/conda/feedstock_root/build_artifacts/brotli-split_1695989787169/work -cachetools==5.3.2 -certifi==2022.12.7 -charset-normalizer==2.1.1 -cmake==3.25.0 -deepspeed==0.13.1 -diffusers==0.25.0 -easygui==0.98.3 -einops==0.6.1 -entrypoints==0.4 -filelock==3.9.0 -frozenlist==1.4.1 -fsspec==2024.2.0 -ftfy==6.1.1 -gmpy2 @ file:///home/conda/feedstock_root/build_artifacts/gmpy2_1666808654411/work -google-auth==2.27.0 -google-auth-oauthlib==0.4.6 -grpcio==1.60.1 -hjson==3.1.0 -huggingface-hub==0.20.1 -idna==3.4 -importlib-metadata==7.0.1 -Jinja2==3.1.2 -jsonschema==4.21.1 -jsonschema-specifications==2023.12.1 --e git+https://github.com/kohya-ss/sd-scripts@cd19df49cd512e13ac90db115c424d19c0e8868a#egg=library -lightning-utilities==0.10.1 -lit==15.0.7 -Markdown==3.5.2 -MarkupSafe==2.1.3 -mpmath==1.3.0 -multidict==6.0.5 -networkx==3.2.1 -ninja==1.11.1.1 -numpy==1.26.3 -oauthlib==3.2.2 -open-clip-torch==2.20.0 -opencv-python==4.7.0.68 -packaging==23.2 -pandas==2.2.0 -pillow==10.2.0 -protobuf==3.19.6 -psutil==5.9.8 -py-cpuinfo @ file:///home/conda/feedstock_root/build_artifacts/py-cpuinfo_1666774466606/work -pyasn1==0.5.1 -pyasn1-modules==0.3.0 -pydantic @ file:///home/conda/feedstock_root/build_artifacts/pydantic_1706543943340/work -pydantic_core @ file:///home/conda/feedstock_root/build_artifacts/pydantic-core_1705674688239/work -pynvml==11.5.0 -PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1661604839144/work -python-dateutil==2.8.2 -pytorch-lightning==1.9.0 -pytz==2024.1 -PyYAML @ file:///home/conda/feedstock_root/build_artifacts/pyyaml_1695373428874/work -referencing==0.33.0 -regex==2023.12.25 -requests==2.28.1 -requests-oauthlib==1.3.1 -rpds-py==0.17.1 -rsa==4.9 -safetensors==0.4.2 -scipy==1.12.0 -sentencepiece==0.1.99 -six==1.16.0 -sympy==1.12 -tensorboard==2.10.1 -tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.1 -timm==0.9.12 -tokenizers==0.15.1 -toml==0.10.2 -toolz==0.12.1 -torch==2.0.1+cu118 -torchaudio==2.2.0 -torchmetrics==1.3.0.post0 -torchvision==0.15.2+cu118 -tqdm==4.66.1 -transformers==4.36.2 -triton==2.0.0 -typing_extensions==4.8.0 -tzdata==2023.4 -urllib3==1.26.13 -voluptuous==0.13.1 -wcwidth==0.2.13 -Werkzeug==3.0.1 -yarl==1.9.4 -zipp==3.17.0 From a98fecaeb1e818c778c90fe441a71a8bd34615ff Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Wed, 7 Feb 2024 17:19:46 +0900 Subject: [PATCH 009/132] forgot setting mixed_precision for deepspeed. 
sorry --- library/train_util.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/library/train_util.py b/library/train_util.py index ea6265109..dbe5a61ce 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3971,6 +3971,9 @@ def prepare_accelerator(args: argparse.Namespace): deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size deepspeed_plugin.deepspeed_config['train_batch_size'] = \ args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE']) + deepspeed_plugin.set_mixed_precision(args.mixed_precision) + if args.mixed_precision.lower() == "fp16": + deepspeed_plugin.deepspeed_config['fp16']['initial_scale_power'] = 0 if args.full_fp16 or args.fp16_master_weights_and_gradients: deepspeed_plugin.deepspeed_config['fp16_master_weights_and_gradients'] = True From 03f0816f86b2d4d8915d81146242fb6f7f99c5ff Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Fri, 9 Feb 2024 17:47:49 +0900 Subject: [PATCH 010/132] the reason not working grad accum steps found. it was becasue of my accelerate settings --- fine_tune.py | 5 +++-- library/train_util.py | 6 +++++- sdxl_train.py | 5 +++-- train_db.py | 5 +++-- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index eb652742c..741e9c857 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -224,8 +224,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): args.max_train_steps = args.max_train_epochs * math.ceil( len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") - + accelerator.print( + f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) diff --git a/library/train_util.py b/library/train_util.py index dbe5a61ce..61c836247 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3975,7 +3975,11 @@ def prepare_accelerator(args: argparse.Namespace): if args.mixed_precision.lower() == "fp16": deepspeed_plugin.deepspeed_config['fp16']['initial_scale_power'] = 0 if args.full_fp16 or args.fp16_master_weights_and_gradients: - deepspeed_plugin.deepspeed_config['fp16_master_weights_and_gradients'] = True + if args.offload_optimizer_device == "cpu": + deepspeed_plugin.deepspeed_config['fp16']['fp16_master_weights_and_grads'] = True + print("[DeepSpeed] full fp16 enable.") + else: + print("full fp16, fp16_master_weights_and_grads currently only supported using ZeRO-Offload with DeepSpeedCPUAdam.") accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, diff --git a/sdxl_train.py b/sdxl_train.py index 54902b873..6ffb1bbaf 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -363,8 +363,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): args.max_train_steps = args.max_train_epochs * math.ceil( len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") - + accelerator.print( + f"override steps. 
steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) diff --git a/train_db.py b/train_db.py index 58536555e..c336a1c1c 100644 --- a/train_db.py +++ b/train_db.py @@ -193,8 +193,9 @@ def train(args): args.max_train_steps = args.max_train_epochs * math.ceil( len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") - + accelerator.print( + f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) From 4d5186d1cf0b0fbda20513def793ac3f5e9d5ea0 Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Thu, 22 Feb 2024 16:20:53 +0900 Subject: [PATCH 011/132] refactored codes, some function moved into train_utils.py --- fine_tune.py | 29 +++++++--------- library/train_util.py | 78 +++++++++++++++++++++++++++++++------------ sdxl_train.py | 43 ++++++++++++------------ train_db.py | 31 ++++++++--------- train_network.py | 34 +++++++++---------- 5 files changed, 119 insertions(+), 96 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 741e9c857..862607545 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -243,24 +243,19 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder.to(weight_dtype) if args.deepspeed: - # wrapping model - import deepspeed - if args.offload_optimizer_device is not None: - accelerator.print('[DeepSpeed] start to manually build cpu_adam.') - deepspeed.ops.op_builder.CPUAdamBuilder().load() - accelerator.print('[DeepSpeed] building cpu_adam done.') - class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder) -> None: - super().__init__() - self.unet = unet - self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - def get_models(self): - return self.unet, self.text_encoders - ds_model = DeepSpeedModel(unet, text_encoders) + training_models_dict = {} + training_models_dict["unet"] = unet + if args.train_text_encoder: training_models_dict["text_encoder"] = text_encoder + + ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - # Now, ds_model is an instance of DeepSpeedEngine. 
- unet, text_encoders = ds_model.get_models() # for compatiblility - text_encoder = text_encoders + + training_models = [] + unet = ds_model.models["unet"] + training_models.append(unet) + if args.train_text_encoder: + text_encoder = ds_model.models["text_encoder"] + training_models.append(text_encoder) else: # acceleratorがなんかよろしくやってくれるらしい if args.train_text_encoder: diff --git a/library/train_util.py b/library/train_util.py index 61c836247..334aaa21e 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3959,27 +3959,7 @@ def prepare_accelerator(args: argparse.Namespace): else None, ) kwargs_handlers = list(filter(lambda x: x is not None, kwargs_handlers)) - deepspeed_plugin = None - if args.deepspeed: - deepspeed_plugin = DeepSpeedPlugin( - zero_stage=args.zero_stage, - gradient_accumulation_steps=args.gradient_accumulation_steps, gradient_clipping=args.max_grad_norm, - offload_optimizer_device=args.offload_optimizer_device, offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, - offload_param_device=args.offload_param_device, offload_param_nvme_path=args.offload_param_nvme_path, - zero3_init_flag=args.zero3_init_flag, zero3_save_16bit_model=args.zero3_save_16bit_model, - ) - deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size - deepspeed_plugin.deepspeed_config['train_batch_size'] = \ - args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE']) - deepspeed_plugin.set_mixed_precision(args.mixed_precision) - if args.mixed_precision.lower() == "fp16": - deepspeed_plugin.deepspeed_config['fp16']['initial_scale_power'] = 0 - if args.full_fp16 or args.fp16_master_weights_and_gradients: - if args.offload_optimizer_device == "cpu": - deepspeed_plugin.deepspeed_config['fp16']['fp16_master_weights_and_grads'] = True - print("[DeepSpeed] full fp16 enable.") - else: - print("full fp16, fp16_master_weights_and_grads currently only supported using ZeRO-Offload with DeepSpeedCPUAdam.") + deepspeed_plugin = prepare_deepspeed_plugin(args) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -3992,6 +3972,62 @@ def prepare_accelerator(args: argparse.Namespace): ) return accelerator +def prepare_deepspeed_plugin(args: argparse.Namespace): + if args.deepspeed is None: return None + try: + import deepspeed + except ImportError as e: + print("deepspeed is not installed. please install deepspeed in your environment with following command. DS_BUILD_OPS=0 pip install deepspeed") + exit(1) + + deepspeed_plugin = DeepSpeedPlugin( + zero_stage=args.zero_stage, + gradient_accumulation_steps=args.gradient_accumulation_steps, gradient_clipping=args.max_grad_norm, + offload_optimizer_device=args.offload_optimizer_device, offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, + offload_param_device=args.offload_param_device, offload_param_nvme_path=args.offload_param_nvme_path, + zero3_init_flag=args.zero3_init_flag, zero3_save_16bit_model=args.zero3_save_16bit_model, + ) + deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size + deepspeed_plugin.deepspeed_config['train_batch_size'] = \ + args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE']) + deepspeed_plugin.set_mixed_precision(args.mixed_precision) + if args.mixed_precision.lower() == "fp16": + deepspeed_plugin.deepspeed_config['fp16']['initial_scale_power'] = 0 # preventing overflow. 
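        # (DeepSpeed's dynamic loss scale starts at 2 ** initial_scale_power,
        # so 0 means an initial scale of 1.0; the default of 16 would start at
        # 65536 and typically overflow-skips the first steps under full fp16.
        # The resulting config section is roughly:
        #     "fp16": {"enabled": true, "initial_scale_power": 0}
        # )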
+ if args.full_fp16 or args.fp16_master_weights_and_gradients: + if args.offload_optimizer_device == "cpu" and args.zero_stage == 2: + deepspeed_plugin.deepspeed_config['fp16']['fp16_master_weights_and_grads'] = True + print("[DeepSpeed] full fp16 enable.") + else: + print("[DeepSpeed]full fp16, fp16_master_weights_and_grads currently only supported using ZeRO-Offload with DeepSpeedCPUAdam on ZeRO-2 stage.") + + if args.offload_optimizer_device is not None: + print('[DeepSpeed] start to manually build cpu_adam.') + deepspeed.ops.op_builder.CPUAdamBuilder().load() + print('[DeepSpeed] building cpu_adam done.') + + return deepspeed_plugin + +def prepare_deepspeed_model(args: argparse.Namespace, **models): + class DeepSpeedWrapper(torch.nn.Module): + def __init__(self, **kw_models) -> None: + super().__init__() + self.models = torch.nn.ModuleDict() + + for key, model in kw_models.items(): + if isinstance(model, list): + model = torch.nn.ModuleList(model) + assert isinstance(model, torch.nn.Module), f"model must be an instance of torch.nn.Module, but got {key} is {type(model)}" + self.models.update( + torch.nn.ModuleDict( + {key: model} + ) + ) + + def get_models(self): + return self.models + + ds_model = DeepSpeedWrapper(**models) + return ds_model def prepare_dtype(args: argparse.Namespace): weight_dtype = torch.float32 diff --git a/sdxl_train.py b/sdxl_train.py index 6ffb1bbaf..2f1a5ce65 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -391,28 +391,29 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder2.to(weight_dtype) if args.deepspeed: - # Wrapping model for DeepSpeed - import deepspeed - if args.offload_optimizer_device is not None: - accelerator.print('[DeepSpeed] start to manually build cpu_adam.') - deepspeed.ops.op_builder.CPUAdamBuilder().load() - accelerator.print('[DeepSpeed] building cpu_adam done.') - - class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder) -> None: - super().__init__() - self.unet = unet - self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - - def get_models(self): - return self.unet, self.text_encoders - text_encoders = [text_encoder1, text_encoder2] - ds_model = DeepSpeedModel(unet, text_encoders) + training_models_dict = {} + if train_unet: + training_models_dict["unet"] = unet + if train_text_encoder1: + text_encoder1.text_model.encoder.layers[-1].requires_grad_(False) + text_encoder1.text_model.final_layer_norm.requires_grad_(False) + training_models_dict["text_encoder1"] = text_encoder1 + if train_text_encoder2: + training_models_dict["text_encoder2"] = text_encoder2 + ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - # Now, ds_model is an instance of DeepSpeedEngine. 
- unet, text_encoders = ds_model.get_models() # for compatiblility - text_encoder1, text_encoder2 = text_encoder = text_encoders - training_models = [unet, text_encoder1, text_encoder2] + + training_models = [] # override training_models + if train_unet: + unet = ds_model.models["unet"] + training_models.append(unet) + if train_text_encoder1: + text_encoder1 = ds_model.models["text_encoder1"] + training_models.append(text_encoder1) + if train_text_encoder2: + text_encoder2 = ds_model.models["text_encoder2"] + training_models.append(text_encoder2) + else: # acceleratorがなんかよろしくやってくれるらしい if train_unet: unet = accelerator.prepare(unet) diff --git a/train_db.py b/train_db.py index c336a1c1c..f188d7bd9 100644 --- a/train_db.py +++ b/train_db.py @@ -216,25 +216,20 @@ def train(args): # acceleratorがなんかよろしくやってくれるらしい if args.deepspeed: - # wrapping model - import deepspeed - if args.offload_optimizer_device is not None: - accelerator.print('[DeepSpeed] start to manually build cpu_adam.') - deepspeed.ops.op_builder.CPUAdamBuilder().load() - accelerator.print('[DeepSpeed] building cpu_adam done.') - class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder) -> None: - super().__init__() - self.unet = unet - self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - - def get_models(self): - return self.unet, self.text_encoders - ds_model = DeepSpeedModel(unet, text_encoders) + training_models_dict = {} + training_models_dict["unet"] = unet + if train_text_encoder: training_models_dict["text_encoder"] = text_encoder + + ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - # Now, ds_model is an instance of DeepSpeedEngine. 
- unet, text_encoders = ds_model.get_models() # for compatiblility - text_encoder = text_encoders + + training_models = [] + unet = ds_model.models["unet"] + training_models.append(unet) + if train_text_encoder: + text_encoder = ds_model.models["text_encoder"] + training_models.append(text_encoder) + else: if train_text_encoder: unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( diff --git a/train_network.py b/train_network.py index cc445d39a..dfa17eb32 100644 --- a/train_network.py +++ b/train_network.py @@ -410,26 +410,22 @@ def train(self, args): # acceleratorがなんかよろしくやってくれるらしい / accelerator will do something good if args.deepspeed: - # wrapping model - import deepspeed - if args.offload_optimizer_device is not None: - accelerator.print('[DeepSpeed] start to manually build cpu_adam.') - deepspeed.ops.op_builder.CPUAdamBuilder().load() - accelerator.print('[DeepSpeed] building cpu_adam done.') - class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder, network) -> None: - super().__init__() - self.unet = unet - self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - self.network = network - - def get_models(self): - return self.unet, self.text_encoders, self.network - ds_model = DeepSpeedModel(unet, text_encoders, network) + training_models_dict = {} + if train_unet: training_models_dict["unet"] = unet + if train_text_encoder: training_models_dict["text_encoder"] = text_encoders + training_models_dict["network"] = network + + ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - # Now, ds_model is an instance of DeepSpeedEngine. 
- unet, text_encoders, network = ds_model.get_models() # for compatiblility - text_encoder = text_encoders + + if train_unet: unet = ds_model.models["unet"] + if train_text_encoder: + text_encoder = ds_model.models["text_encoder"] + if len(ds_model.models["text_encoder"]) > 1: + text_encoders = text_encoder + else: + text_encoders = [text_encoder] + else: if train_unet: unet = accelerator.prepare(unet) From f2c727fc8cadf0971c24fdb42c8684032e7e6f80 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 26 Feb 2024 23:19:58 +0900 Subject: [PATCH 012/132] add minimal impl for masked loss --- library/config_util.py | 38 +++++++++++++++++++++++++------------- library/train_util.py | 3 +++ train_network.py | 18 +++++++++++++++++- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index eb652ecf3..edc6a5385 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -41,12 +41,17 @@ DatasetGroup, ) from .utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) + def add_config_arguments(parser: argparse.ArgumentParser): - parser.add_argument("--dataset_config", type=Path, default=None, help="config file for detail settings / 詳細な設定用の設定ファイル") + parser.add_argument( + "--dataset_config", type=Path, default=None, help="config file for detail settings / 詳細な設定用の設定ファイル" + ) # TODO: inherit Params class in Subset, Dataset @@ -248,9 +253,10 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] } def __init__(self, support_dreambooth: bool, support_finetuning: bool, support_controlnet: bool, support_dropout: bool) -> None: - assert ( - support_dreambooth or support_finetuning or support_controlnet - ), "Neither DreamBooth mode nor fine tuning mode specified. Please specify one mode or more. / DreamBooth モードか fine tuning モードのどちらも指定されていません。1つ以上指定してください。" + assert support_dreambooth or support_finetuning or support_controlnet, ( + "Neither DreamBooth mode nor fine tuning mode nor controlnet mode specified. Please specify one mode or more." + + " / DreamBooth モードか fine tuning モードか controlnet モードのどれも指定されていません。1つ以上指定してください。" + ) self.db_subset_schema = self.__merge_dict( self.SUBSET_ASCENDABLE_SCHEMA, @@ -362,7 +368,9 @@ def sanitize_argparse_namespace(self, argparse_namespace: argparse.Namespace) -> return self.argparse_config_validator(argparse_namespace) except MultipleInvalid: # XXX: this should be a bug - logger.error("Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。") + logger.error( + "Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。" + ) raise # NOTE: value would be overwritten by latter dict if there is already the same key @@ -547,11 +555,11 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu " ", ) - logger.info(f'{info}') + logger.info(f"{info}") # make buckets first because it determines the length of dataset # and set the same seed for all datasets - seed = random.randint(0, 2**31) # actual seed is seed + epoch_no + seed = random.randint(0, 2**31) # actual seed is seed + epoch_no for i, dataset in enumerate(datasets): logger.info(f"[Dataset {i}]") dataset.make_buckets() @@ -638,13 +646,17 @@ def load_user_config(file: str) -> dict: with open(file, "r") as f: config = json.load(f) except Exception: - logger.error(f"Error on parsing JSON config file. Please check the format. 
/ JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}") + logger.error( + f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}" + ) raise elif file.name.lower().endswith(".toml"): try: config = toml.load(file) except Exception: - logger.error(f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}") + logger.error( + f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}" + ) raise else: raise ValueError(f"not supported config file format / 対応していない設定ファイルの形式です: {file}") @@ -671,13 +683,13 @@ def load_user_config(file: str) -> dict: train_util.prepare_dataset_args(argparse_namespace, config_args.support_finetuning) logger.info("[argparse_namespace]") - logger.info(f'{vars(argparse_namespace)}') + logger.info(f"{vars(argparse_namespace)}") user_config = load_user_config(config_args.dataset_config) logger.info("") logger.info("[user_config]") - logger.info(f'{user_config}') + logger.info(f"{user_config}") sanitizer = ConfigSanitizer( config_args.support_dreambooth, config_args.support_finetuning, config_args.support_controlnet, config_args.support_dropout @@ -686,10 +698,10 @@ def load_user_config(file: str) -> dict: logger.info("") logger.info("[sanitized_user_config]") - logger.info(f'{sanitized_user_config}') + logger.info(f"{sanitized_user_config}") blueprint = BlueprintGenerator(sanitizer).generate(user_config, argparse_namespace) logger.info("") logger.info("[blueprint]") - logger.info(f'{blueprint}') + logger.info(f"{blueprint}") diff --git a/library/train_util.py b/library/train_util.py index b71e4edc6..7fe5bc56e 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1810,6 +1810,9 @@ def __init__( db_subsets = [] for subset in subsets: + assert ( + not subset.random_crop + ), "random_crop is not supported in ControlNetDataset / random_cropはControlNetDatasetではサポートされていません" db_subset = DreamBoothSubset( subset.image_dir, False, diff --git a/train_network.py b/train_network.py index e5b26d8a2..e3ce7bd36 100644 --- a/train_network.py +++ b/train_network.py @@ -13,6 +13,7 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from torch.nn.parallel import DistributedDataParallel as DDP @@ -157,7 +158,7 @@ def train(self, args): # データセットを準備する if args.dataset_class is None: - blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, False, True)) + blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True)) if use_user_config: logger.info(f"Loading dataset config from {args.dataset_config}") user_config = config_util.load_user_config(args.dataset_config) @@ -834,6 +835,16 @@ def remove_model(old_ckpt_name): target = noise loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + + if args.masked_loss: + # mask image is -1 to 1. 
we need to convert it to 0 to 1 + mask_image = batch["conditioning_images"].to(dtype=weight_dtype)[:, 0].unsqueeze(1) # use R channel + + # resize to the same size as the loss + mask_image = torch.nn.functional.interpolate(mask_image, size=loss.shape[2:], mode="area") + mask_image = mask_image / 2 + 0.5 + loss = loss * mask_image + loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight @@ -1050,6 +1061,11 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", ) + parser.add_argument( + "--masked_loss", + action="store_true", + help="apply mask for caclulating loss. conditioning_data_dir is required for dataset. / 損失計算時にマスクを適用する。datasetにはconditioning_data_dirが必要", + ) return parser From 175193623b39027ffcfe0c0ae250dbce564ed6ef Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 26 Feb 2024 23:29:41 +0900 Subject: [PATCH 013/132] update readme --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index e1b6a26c3..9cc79cc09 100644 --- a/README.md +++ b/README.md @@ -249,6 +249,13 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum ## Change History +### Masked loss + +`train_network.py` and `sdxl_train_network.py` now support the masked loss. `--masked_loss` option is added. + +ControlNet dataset is used to specify the mask. The mask images should be the RGB images. The pixel value 255 in R channel is treated as the mask (the loss is calculated only for the pixels with the mask), and 0 is treated as the non-mask. See details for the dataset specification in the [LLLite documentation](./docs/train_lllite_README.md#preparing-the-dataset). + + ### Working in progress - `train_network.py` and `sdxl_train_network.py` are modified to record some dataset settings in the metadata of the trained model (`caption_prefix`, `caption_suffix`, `keep_tokens_separator`, `secondary_separator`, `enable_wildcard`). From 4a5546d40e6de5789be78dd16373d2b820b8754e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 26 Feb 2024 23:39:56 +0900 Subject: [PATCH 014/132] fix typo --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index e3ce7bd36..f5617986c 100644 --- a/train_network.py +++ b/train_network.py @@ -1064,7 +1064,7 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument( "--masked_loss", action="store_true", - help="apply mask for caclulating loss. conditioning_data_dir is required for dataset. / 損失計算時にマスクを適用する。datasetにはconditioning_data_dirが必要", + help="apply mask for calculating loss. conditioning_data_dir is required for dataset. 
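A self-contained sketch of the masked-loss arithmetic above, with made-up shapes (in the trainer, the unreduced MSE comes from noise_pred vs. target, and the mask from the ControlNet dataset's conditioning images):

import torch
import torch.nn.functional as F

# Stand-ins: per-pixel MSE over latents (B, C, H, W), and RGB conditioning
# images in [-1, 1] at the original, 8x larger resolution.
loss = torch.rand(2, 4, 64, 64)
cond_images = torch.rand(2, 3, 512, 512) * 2 - 1

mask = cond_images[:, 0].unsqueeze(1)                         # R channel -> (B, 1, 512, 512)
mask = F.interpolate(mask, size=loss.shape[2:], mode="area")  # downscale to latent size
mask = mask / 2 + 0.5                                         # [-1, 1] -> [0, 1]

loss = (loss * mask).mean([1, 2, 3])  # per-sample loss; unmasked pixels contribute ~0
print(loss.shape)                     # torch.Size([2])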
/ 損失計算時にマスクを適用する。datasetにはconditioning_data_dirが必要", ) return parser From e3ccf8fbf73a0f728fc167a20b1e0648a3604f41 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 27 Feb 2024 21:30:46 +0900 Subject: [PATCH 015/132] make deepspeed_utils --- fine_tune.py | 35 +++++----- library/deepspeed_utils.py | 139 +++++++++++++++++++++++++++++++++++++ library/train_util.py | 110 ++--------------------------- sdxl_train.py | 64 ++++++++--------- train_db.py | 39 ++++++----- train_network.py | 51 +++++++------- 6 files changed, 238 insertions(+), 200 deletions(-) create mode 100644 library/deepspeed_utils.py diff --git a/fine_tune.py b/fine_tune.py index c5e97d267..b018a933d 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -10,7 +10,9 @@ from tqdm import tqdm import torch +from library import deepspeed_utils from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from accelerate.utils import set_seed @@ -42,6 +44,7 @@ def train(args): train_util.verify_training_args(args) train_util.prepare_dataset_args(args, True) + deepspeed_utils.prepare_deepspeed_args(args) setup_logging(args, reset=True) cache_latents = args.cache_latents @@ -219,7 +222,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. + num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) @@ -231,7 +234,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): accelerator.print( f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" ) - + # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) @@ -248,21 +251,16 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder.to(weight_dtype) if args.deepspeed: - training_models_dict = {} - training_models_dict["unet"] = unet - if args.train_text_encoder: training_models_dict["text_encoder"] = text_encoder - - ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) - ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - - training_models = [] - unet = ds_model.models["unet"] - training_models.append(unet) if args.train_text_encoder: - text_encoder = ds_model.models["text_encoder"] - training_models.append(text_encoder) - - else: # acceleratorがなんかよろしくやってくれるらしい + ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet, text_encoder=text_encoder) + else: + ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + ds_model, optimizer, train_dataloader, lr_scheduler + ) + training_models = [ds_model] + else: + # acceleratorがなんかよろしくやってくれるらしい if args.train_text_encoder: unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( unet, text_encoder, optimizer, train_dataloader, lr_scheduler @@ -327,13 +325,13 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): for step, batch in enumerate(train_dataloader): current_step.value = global_step - with accelerator.accumulate(training_models[0]): # 複数モデルに対応していない模様だがとりあえずこうしておく + with accelerator.accumulate(*training_models): with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: latents = batch["latents"].to(accelerator.device) # .to(dtype=weight_dtype) else: # latentに変換 - latents = 
vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample() + latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample().to(weight_dtype) latents = latents * 0.18215 b_size = latents.shape[0] @@ -493,6 +491,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, False, True, True) train_util.add_training_arguments(parser, False) + deepspeed_utils.add_deepspeed_arguments(parser) train_util.add_sd_saving_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) diff --git a/library/deepspeed_utils.py b/library/deepspeed_utils.py new file mode 100644 index 000000000..99a7b2b3b --- /dev/null +++ b/library/deepspeed_utils.py @@ -0,0 +1,139 @@ +import os +import argparse +import torch +from accelerate import DeepSpeedPlugin, Accelerator + +from .utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +def add_deepspeed_arguments(parser: argparse.ArgumentParser): + # DeepSpeed Arguments. https://huggingface.co/docs/accelerate/usage_guides/deepspeed + parser.add_argument("--deepspeed", action="store_true", help="enable deepspeed training") + parser.add_argument("--zero_stage", type=int, default=2, choices=[0, 1, 2, 3], help="Possible options are 0,1,2,3.") + parser.add_argument( + "--offload_optimizer_device", + type=str, + default=None, + choices=[None, "cpu", "nvme"], + help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3.", + ) + parser.add_argument( + "--offload_optimizer_nvme_path", + type=str, + default=None, + help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.", + ) + parser.add_argument( + "--offload_param_device", + type=str, + default=None, + choices=[None, "cpu", "nvme"], + help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3.", + ) + parser.add_argument( + "--offload_param_nvme_path", + type=str, + default=None, + help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.", + ) + parser.add_argument( + "--zero3_init_flag", + action="store_true", + help="Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models." + "Only applicable with ZeRO Stage-3.", + ) + parser.add_argument( + "--zero3_save_16bit_model", + action="store_true", + help="Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3.", + ) + parser.add_argument( + "--fp16_master_weights_and_gradients", + action="store_true", + help="fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32.", + ) + + +def prepare_deepspeed_args(args: argparse.Namespace): + if not args.deepspeed: + return + + # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. + args.max_data_loader_n_workers = 1 + + +def prepare_deepspeed_plugin(args: argparse.Namespace): + if not args.deepspeed: + return None + + try: + import deepspeed + except ImportError as e: + logger.error( + "deepspeed is not installed. please install deepspeed in your environment with following command. 
DS_BUILD_OPS=0 pip install deepspeed"
+        )
+        exit(1)
+
+    deepspeed_plugin = DeepSpeedPlugin(
+        zero_stage=args.zero_stage,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        gradient_clipping=args.max_grad_norm,
+        offload_optimizer_device=args.offload_optimizer_device,
+        offload_optimizer_nvme_path=args.offload_optimizer_nvme_path,
+        offload_param_device=args.offload_param_device,
+        offload_param_nvme_path=args.offload_param_nvme_path,
+        zero3_init_flag=args.zero3_init_flag,
+        zero3_save_16bit_model=args.zero3_save_16bit_model,
+    )
+    deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = args.train_batch_size
+    deepspeed_plugin.deepspeed_config["train_batch_size"] = (
+        args.train_batch_size * args.gradient_accumulation_steps * int(os.environ["WORLD_SIZE"])
+    )
+    deepspeed_plugin.set_mixed_precision(args.mixed_precision)
+    if args.mixed_precision.lower() == "fp16":
+        deepspeed_plugin.deepspeed_config["fp16"]["initial_scale_power"] = 0  # preventing overflow.
+    if args.full_fp16 or args.fp16_master_weights_and_gradients:
+        if args.offload_optimizer_device == "cpu" and args.zero_stage == 2:
+            deepspeed_plugin.deepspeed_config["fp16"]["fp16_master_weights_and_grads"] = True
+            logger.info("[DeepSpeed] full fp16 enabled.")
+        else:
+            logger.info(
+                "[DeepSpeed] full fp16 / fp16_master_weights_and_grads is currently only supported with ZeRO-Offload and DeepSpeedCPUAdam on ZeRO stage 2."
+            )
+
+    if args.offload_optimizer_device is not None:
+        logger.info("[DeepSpeed] start building cpu_adam manually.")
+        deepspeed.ops.op_builder.CPUAdamBuilder().load()
+        logger.info("[DeepSpeed] building cpu_adam done.")
+
+    return deepspeed_plugin
+
+
+# The Accelerate library does not support multiple models with DeepSpeed, so we wrap the models into a single module.
+def prepare_deepspeed_model(args: argparse.Namespace, **models):
+    # remove None from models
+    models = {k: v for k, v in models.items() if v is not None}
+
+    class DeepSpeedWrapper(torch.nn.Module):
+        def __init__(self, **kw_models) -> None:
+            super().__init__()
+            self.models = torch.nn.ModuleDict()
+
+            for key, model in kw_models.items():
+                if isinstance(model, list):
+                    model = torch.nn.ModuleList(model)
+                assert isinstance(
+                    model, torch.nn.Module
+                ), f"model must be an instance of torch.nn.Module, but {key} is {type(model)}"
+                self.models.update(torch.nn.ModuleDict({key: model}))
+
+        def get_models(self):
+            return self.models
+
+    ds_model = DeepSpeedWrapper(**models)
+    return ds_model
diff --git a/library/train_util.py b/library/train_util.py
index 3781dcde8..38e1b458d 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -21,7 +21,6 @@
     Union,
 )
 from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs
-from accelerate import DeepSpeedPlugin
 import glob
 import math
 import os
@@ -70,6 +69,7 @@
 import library.model_util as model_util
 import library.huggingface_util as huggingface_util
 import library.sai_model_spec as sai_model_spec
+import library.deepspeed_utils as deepspeed_utils
 from library.utils import setup_logging

 setup_logging()
@@ -3243,52 +3243,6 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth:
         "--prior_loss_weight", type=float, default=1.0, help="loss weight for regularization images / 正則化画像のlossの重み"
     )

-    # DeepSpeed Arguments.
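A quick check of the train_batch_size arithmetic configured on the plugin above, with made-up values (DeepSpeed asserts that the global batch size equals micro batch x gradient accumulation x world size):

train_batch_size = 4             # per-device batch size (--train_batch_size)
gradient_accumulation_steps = 8  # --gradient_accumulation_steps
world_size = 2                   # int(os.environ["WORLD_SIZE"])

# 64 samples contribute to each optimizer step
print(train_batch_size * gradient_accumulation_steps * world_size)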
https://huggingface.co/docs/accelerate/usage_guides/deepspeed - parser.add_argument("--deepspeed", action="store_true", help="enable deepspeed training") - parser.add_argument( - "--zero_stage", - type=int, default=2, - choices=[0, 1, 2, 3], - help="Possible options are 0,1,2,3." - ) - parser.add_argument( - "--offload_optimizer_device", - type=str, default=None, - choices=[None, "cpu", "nvme"], - help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3." - ) - parser.add_argument( - "--offload_optimizer_nvme_path", - type=str, default=None, - help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3." - ) - parser.add_argument( - "--offload_param_device", - type=str, default=None, - choices=[None, "cpu", "nvme"], - help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3." - ) - parser.add_argument( - "--offload_param_nvme_path", - type=str, default=None, - help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3." - ) - parser.add_argument( - "--zero3_init_flag", - action="store_true", - help="Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models." - "Only applicable with ZeRO Stage-3." - ) - parser.add_argument( - "--zero3_save_16bit_model", - action="store_true", - help="Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3." - ) - parser.add_argument( - "--fp16_master_weights_and_gradients", - action="store_true", - help="fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32." - ) def verify_training_args(args: argparse.Namespace): r""" @@ -4090,6 +4044,10 @@ def load_tokenizer(args: argparse.Namespace): def prepare_accelerator(args: argparse.Namespace): + """ + this function also prepares deepspeed plugin + """ + if args.logging_dir is None: logging_dir = None else: @@ -4135,7 +4093,7 @@ def prepare_accelerator(args: argparse.Namespace): ), ) kwargs_handlers = list(filter(lambda x: x is not None, kwargs_handlers)) - deepspeed_plugin = prepare_deepspeed_plugin(args) + deepspeed_plugin = deepspeed_utils.prepare_deepspeed_plugin(args) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -4149,62 +4107,6 @@ def prepare_accelerator(args: argparse.Namespace): print("accelerator device:", accelerator.device) return accelerator -def prepare_deepspeed_plugin(args: argparse.Namespace): - if args.deepspeed is None: return None - try: - import deepspeed - except ImportError as e: - print("deepspeed is not installed. please install deepspeed in your environment with following command. 
DS_BUILD_OPS=0 pip install deepspeed") - exit(1) - - deepspeed_plugin = DeepSpeedPlugin( - zero_stage=args.zero_stage, - gradient_accumulation_steps=args.gradient_accumulation_steps, gradient_clipping=args.max_grad_norm, - offload_optimizer_device=args.offload_optimizer_device, offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, - offload_param_device=args.offload_param_device, offload_param_nvme_path=args.offload_param_nvme_path, - zero3_init_flag=args.zero3_init_flag, zero3_save_16bit_model=args.zero3_save_16bit_model, - ) - deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size - deepspeed_plugin.deepspeed_config['train_batch_size'] = \ - args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE']) - deepspeed_plugin.set_mixed_precision(args.mixed_precision) - if args.mixed_precision.lower() == "fp16": - deepspeed_plugin.deepspeed_config['fp16']['initial_scale_power'] = 0 # preventing overflow. - if args.full_fp16 or args.fp16_master_weights_and_gradients: - if args.offload_optimizer_device == "cpu" and args.zero_stage == 2: - deepspeed_plugin.deepspeed_config['fp16']['fp16_master_weights_and_grads'] = True - print("[DeepSpeed] full fp16 enable.") - else: - print("[DeepSpeed]full fp16, fp16_master_weights_and_grads currently only supported using ZeRO-Offload with DeepSpeedCPUAdam on ZeRO-2 stage.") - - if args.offload_optimizer_device is not None: - print('[DeepSpeed] start to manually build cpu_adam.') - deepspeed.ops.op_builder.CPUAdamBuilder().load() - print('[DeepSpeed] building cpu_adam done.') - - return deepspeed_plugin - -def prepare_deepspeed_model(args: argparse.Namespace, **models): - class DeepSpeedWrapper(torch.nn.Module): - def __init__(self, **kw_models) -> None: - super().__init__() - self.models = torch.nn.ModuleDict() - - for key, model in kw_models.items(): - if isinstance(model, list): - model = torch.nn.ModuleList(model) - assert isinstance(model, torch.nn.Module), f"model must be an instance of torch.nn.Module, but got {key} is {type(model)}" - self.models.update( - torch.nn.ModuleDict( - {key: model} - ) - ) - - def get_models(self): - return self.models - - ds_model = DeepSpeedWrapper(**models) - return ds_model def prepare_dtype(args: argparse.Namespace): weight_dtype = torch.float32 diff --git a/sdxl_train.py b/sdxl_train.py index 5e5e9f291..0feb4e367 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -11,11 +11,12 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from accelerate.utils import set_seed from diffusers import DDPMScheduler -from library import sdxl_model_util +from library import deepspeed_utils, sdxl_model_util import library.train_util as train_util @@ -97,6 +98,7 @@ def train(args): train_util.verify_training_args(args) train_util.prepare_dataset_args(args, True) sdxl_train_util.verify_sdxl_training_args(args) + deepspeed_utils.prepare_deepspeed_args(args) setup_logging(args, reset=True) assert ( @@ -361,7 +363,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. 
+        num_workers=n_workers,
         persistent_workers=args.persistent_data_loader_workers,
     )

@@ -398,41 +400,31 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
         text_encoder1.to(weight_dtype)
         text_encoder2.to(weight_dtype)

+    # freeze last layer and final_layer_norm in te1 since we use the output of the penultimate layer
+    if train_text_encoder1:
+        text_encoder1.text_model.encoder.layers[-1].requires_grad_(False)
+        text_encoder1.text_model.final_layer_norm.requires_grad_(False)
+
     if args.deepspeed:
-        training_models_dict = {}
-        if train_unet:
-            training_models_dict["unet"] = unet
-        if train_text_encoder1:
-            text_encoder1.text_model.encoder.layers[-1].requires_grad_(False)
-            text_encoder1.text_model.final_layer_norm.requires_grad_(False)
-            training_models_dict["text_encoder1"] = text_encoder1
-        if train_text_encoder2:
-            training_models_dict["text_encoder2"] = text_encoder2
-        ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict)
-        ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler)
-
-        training_models = []  # override training_models
-        if train_unet:
-            unet = ds_model.models["unet"]
-            training_models.append(unet)
-        if train_text_encoder1:
-            text_encoder1 = ds_model.models["text_encoder1"]
-            training_models.append(text_encoder1)
-        if train_text_encoder2:
-            text_encoder2 = ds_model.models["text_encoder2"]
-            training_models.append(text_encoder2)
+        ds_model = deepspeed_utils.prepare_deepspeed_model(
+            args,
+            unet=unet if train_unet else None,
+            text_encoder1=text_encoder1 if train_text_encoder1 else None,
+            text_encoder2=text_encoder2 if train_text_encoder2 else None,
+        )
+        ds_model = accelerator.prepare(ds_model)
+        training_models = [ds_model]

-    else:  # acceleratorがなんかよろしくやってくれるらしい
+    else:
+        # acceleratorがなんかよろしくやってくれるらしい
         if train_unet:
             unet = accelerator.prepare(unet)
         if train_text_encoder1:
             text_encoder1 = accelerator.prepare(text_encoder1)
         if train_text_encoder2:
             text_encoder2 = accelerator.prepare(text_encoder2)
-        optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
+
+    optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)

     # TextEncoderの出力をキャッシュするときにはCPUへ移動する
     if args.cache_text_encoder_outputs:
@@ -446,8 +438,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
         text_encoder2.to(accelerator.device)

     # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする
-    if args.full_fp16 and not args.deepspeed:
+    if args.full_fp16:
         # During DeepSpeed training, accelerate does not handle fp16/bf16 mixed precision directly via the scaler; the DeepSpeed engine does.
+        # -> But we think it's OK to patch the accelerator even if DeepSpeed is enabled.
         train_util.patch_accelerator_for_fp16_training(accelerator)

     # resumeする
@@ -508,10 +501,10 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
         for step, batch in enumerate(train_dataloader):
             current_step.value = global_step
             with accelerator.accumulate(*training_models):
-                with torch.no_grad():  # why this block differ within train_network.py?
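A sketch of the penultimate-layer freeze hoisted out of the else branch above; the checkpoint id is only an illustrative CLIP text model, not the SDXL loading path:

from transformers import CLIPTextModel

# Conditioning uses the penultimate hidden state, so the last transformer
# block and the final LayerNorm never affect the loss and can stay frozen.
te1 = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
te1.text_model.encoder.layers[-1].requires_grad_(False)
te1.text_model.final_layer_norm.requires_grad_(False)
print(sum(p.numel() for p in te1.parameters() if p.requires_grad))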
- if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) - else: + if "latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) + else: + with torch.no_grad(): # latentに変換 latents = vae.encode(batch["images"].to(vae_dtype)).latent_dist.sample().to(weight_dtype) @@ -519,7 +512,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if torch.any(torch.isnan(latents)): accelerator.print("NaN found in latents, replacing with zeros") latents = torch.nan_to_num(latents, 0, out=latents) - latents = latents * sdxl_model_util.VAE_SCALE_FACTOR + latents = latents * sdxl_model_util.VAE_SCALE_FACTOR if "text_encoder_outputs1_list" not in batch or batch["text_encoder_outputs1_list"] is None: input_ids1 = batch["input_ids"] @@ -768,6 +761,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, True, True) train_util.add_training_arguments(parser, False) + deepspeed_utils.add_deepspeed_arguments(parser) train_util.add_sd_saving_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) diff --git a/train_db.py b/train_db.py index 66a83d1df..ea1cfeb8a 100644 --- a/train_db.py +++ b/train_db.py @@ -11,7 +11,9 @@ from tqdm import tqdm import torch +from library import deepspeed_utils from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from accelerate.utils import set_seed @@ -46,6 +48,7 @@ def train(args): train_util.verify_training_args(args) train_util.prepare_dataset_args(args, False) + deepspeed_utils.prepare_deepspeed_args(args) setup_logging(args, reset=True) cache_latents = args.cache_latents @@ -187,7 +190,7 @@ def train(args): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. 
+ num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) @@ -220,30 +223,27 @@ def train(args): # acceleratorがなんかよろしくやってくれるらしい if args.deepspeed: - training_models_dict = {} - training_models_dict["unet"] = unet - if train_text_encoder: training_models_dict["text_encoder"] = text_encoder - - ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) - ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - - training_models = [] - unet = ds_model.models["unet"] - training_models.append(unet) - if train_text_encoder: - text_encoder = ds_model.models["text_encoder"] - training_models.append(text_encoder) - + if args.train_text_encoder: + ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet, text_encoder=text_encoder) + else: + ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + ds_model, optimizer, train_dataloader, lr_scheduler + ) + training_models = [ds_model] + else: if train_text_encoder: unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( unet, text_encoder, optimizer, train_dataloader, lr_scheduler ) + training_models = [unet, text_encoder] else: unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) + training_models = [unet] - if not train_text_encoder: - text_encoder.to(accelerator.device, dtype=weight_dtype) # to avoid 'cpu' vs 'cuda' error + if not train_text_encoder: + text_encoder.to(accelerator.device, dtype=weight_dtype) # to avoid 'cpu' vs 'cuda' error # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする if args.full_fp16: @@ -312,8 +312,10 @@ def train(args): if not args.gradient_checkpointing: text_encoder.train(False) text_encoder.requires_grad_(False) + if len(training_models) == 2: + training_models = training_models[0] # remove text_encoder from training_models - with accelerator.accumulate(unet): + with accelerator.accumulate(*training_models): with torch.no_grad(): # latentに変換 if cache_latents: @@ -480,6 +482,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, False, True) train_util.add_training_arguments(parser, True) + deepspeed_utils.add_deepspeed_arguments(parser) train_util.add_sd_saving_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) diff --git a/train_network.py b/train_network.py index af1b7f635..a6ce169a9 100644 --- a/train_network.py +++ b/train_network.py @@ -13,13 +13,14 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed from diffusers import DDPMScheduler -from library import model_util +from library import deepspeed_utils, model_util import library.train_util as train_util from library.train_util import ( @@ -141,6 +142,7 @@ def train(self, args): training_started_at = time.time() train_util.verify_training_args(args) train_util.prepare_dataset_args(args, True) + deepspeed_utils.prepare_deepspeed_args(args) setup_logging(args, reset=True) cache_latents = args.cache_latents @@ -357,7 +359,7 @@ def train(self, args): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: 
DataLoader worker exited unexpectedly with exit code 1. + num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) @@ -414,22 +416,17 @@ def train(self, args): # acceleratorがなんかよろしくやってくれるらしい / accelerator will do something good if args.deepspeed: - training_models_dict = {} - if train_unet: training_models_dict["unet"] = unet - if train_text_encoder: training_models_dict["text_encoder"] = text_encoders - training_models_dict["network"] = network - - ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) - ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - - if train_unet: unet = ds_model.models["unet"] - if train_text_encoder: - text_encoder = ds_model.models["text_encoder"] - if len(ds_model.models["text_encoder"]) > 1: - text_encoders = text_encoder - else: - text_encoders = [text_encoder] - + ds_model = deepspeed_utils.prepare_deepspeed_model( + args, + unet=unet if train_unet else None, + text_encoder1=text_encoders[0] if train_text_encoder else None, + text_encoder2=text_encoders[1] if train_text_encoder and len(text_encoders) > 1 else None, + network=network, + ) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + ds_model, optimizer, train_dataloader, lr_scheduler + ) + training_model = ds_model else: if train_unet: unet = accelerator.prepare(unet) @@ -444,7 +441,10 @@ def train(self, args): else: pass # if text_encoder is not trained, no need to prepare. and device and dtype are already set - network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(network, optimizer, train_dataloader, lr_scheduler) + network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + network, optimizer, train_dataloader, lr_scheduler + ) + training_model = network if args.gradient_checkpointing: # according to TI example in Diffusers, train is required @@ -777,13 +777,13 @@ def remove_model(old_ckpt_name): for step, batch in enumerate(train_dataloader): current_step.value = global_step - with accelerator.accumulate(network): + with accelerator.accumulate(training_model): on_step_start(text_encoder, unet) - with torch.no_grad(): - if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device) - else: + if "latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to(accelerator.device) + else: + with torch.no_grad(): # latentに変換 latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample() @@ -791,7 +791,7 @@ def remove_model(old_ckpt_name): if torch.any(torch.isnan(latents)): accelerator.print("NaN found in latents, replacing with zeros") latents = torch.nan_to_num(latents, 0, out=latents) - latents = latents * self.vae_scale_factor + latents = latents * self.vae_scale_factor # get multiplier for each sample if network_has_multiplier: @@ -976,6 +976,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, True, True) train_util.add_training_arguments(parser, True) + deepspeed_utils.add_deepspeed_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser) From a9b64ffba8efbb0991a094e38b1f5d5c56680caf Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 27 Feb 2024 21:43:55 +0900 Subject: [PATCH 016/132] support masked loss in sdxl_train ref #589 --- README.md | 4 
+++- sdxl_train.py | 20 +++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9cc79cc09..354983c38 100644 --- a/README.md +++ b/README.md @@ -251,7 +251,9 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum ### Masked loss -`train_network.py` and `sdxl_train_network.py` now support the masked loss. `--masked_loss` option is added. +`train_network.py`, `sdxl_train_network.py` and `sdxl_train.py` now support the masked loss. `--masked_loss` option is added. + +NOTE: `train_network.py` and `sdxl_train.py` are not tested yet. ControlNet dataset is used to specify the mask. The mask images should be the RGB images. The pixel value 255 in R channel is treated as the mask (the loss is calculated only for the pixels with the mask), and 0 is treated as the non-mask. See details for the dataset specification in the [LLLite documentation](./docs/train_lllite_README.md#preparing-the-dataset). diff --git a/sdxl_train.py b/sdxl_train.py index e0df263d6..448a160f6 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -11,6 +11,7 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from accelerate.utils import set_seed @@ -124,7 +125,7 @@ def train(args): # データセットを準備する if args.dataset_class is None: - blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, False, True)) + blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True)) if args.dataset_config is not None: logger.info(f"Load dataset config from {args.dataset_config}") user_config = config_util.load_user_config(args.dataset_config) @@ -579,6 +580,16 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): ): # do not mean over batch dimension for snr weight or scale v-pred loss loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + + if args.masked_loss: + # mask image is -1 to 1. we need to convert it to 0 to 1 + mask_image = batch["conditioning_images"].to(dtype=weight_dtype)[:, 0].unsqueeze(1) # use R channel + + # resize to the same size as the loss + mask_image = torch.nn.functional.interpolate(mask_image, size=loss.shape[2:], mode="area") + mask_image = mask_image / 2 + 0.5 + loss = loss * mask_image + loss = loss.mean([1, 2, 3]) if args.min_snr_gamma: @@ -780,6 +791,13 @@ def setup_parser() -> argparse.ArgumentParser: + f"U-Netの各ブロックの学習率、カンマ区切り、{UNET_NUM_BLOCKS_FOR_BLOCK_LR}個の値", ) + # TODO common masked_loss argument + parser.add_argument( + "--masked_loss", + action="store_true", + help="apply mask for calculating loss. conditioning_data_dir is required for dataset. 
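Given the mask specification described earlier (RGB image, R channel 255 = compute loss, 0 = ignore), a mask could be prepared like this sketch (directory layout is hypothetical):

import os

import numpy as np
from PIL import Image

os.makedirs("conditioning_data", exist_ok=True)  # hypothetical layout
mask = np.zeros((512, 512, 3), dtype=np.uint8)
mask[128:384, 128:384, 0] = 255  # compute loss only inside the center square
Image.fromarray(mask).save("conditioning_data/0001.png")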
/ 損失計算時にマスクを適用する。datasetにはconditioning_data_dirが必要", + ) + return parser From 124ec45876a9f07820b42fda0d7ca9019de773d5 Mon Sep 17 00:00:00 2001 From: Horizon1704 <92718180+Horizon1704@users.noreply.github.com> Date: Sun, 10 Mar 2024 22:53:05 +0800 Subject: [PATCH 017/132] Add "encoding='utf-8'" --- library/train_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index d2b69edb5..5f23dd13f 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3474,7 +3474,7 @@ def read_config_from_file(args: argparse.Namespace, parser: argparse.ArgumentPar exit(1) logger.info(f"Loading settings from {config_path}...") - with open(config_path, "r") as f: + with open(config_path, "r", encoding='utf-8') as f: config_dict = toml.load(f) # combine all sections into one From 095b8035e63f7c79a232114d8f0e1ec27f431ebc Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Sun, 10 Mar 2024 23:33:38 +0800 Subject: [PATCH 018/132] save state on train end --- fine_tune.py | 2 +- library/train_util.py | 5 +++++ sdxl_train.py | 2 +- sdxl_train_control_net_lllite.py | 2 +- train_controlnet.py | 2 +- train_db.py | 2 +- train_network.py | 2 +- train_textual_inversion.py | 2 +- train_textual_inversion_XTI.py | 2 +- 9 files changed, 13 insertions(+), 8 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 875a91951..46f128287 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -457,7 +457,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): accelerator.end_training() - if args.save_state and is_main_process: + if is_main_process and (args.save_state or args.save_state_on_train_end): train_util.save_state_on_train_end(args, accelerator) del accelerator # この後メモリを使うのでこれは消す diff --git a/library/train_util.py b/library/train_util.py index d2b69edb5..b3ca15f55 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2890,6 +2890,11 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: action="store_true", help="save training state additionally (including optimizer states etc.) / optimizerなど学習状態も含めたstateを追加で保存する", ) + parser.add_argument( + "--save_state_on_train_end", + action="store_true", + help="save training state additionally (including optimizer states etc.) 
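The encoding fix in patch 017 matters mainly on Windows, where open() falls back to the legacy locale code page; a sketch of the safe pattern (file name hypothetical):

import toml

# Without encoding="utf-8", a config containing non-ASCII captions can raise
# UnicodeDecodeError on systems whose default encoding is not UTF-8.
with open("train_config.toml", "r", encoding="utf-8") as f:
    config_dict = toml.load(f)
print(list(config_dict.keys()))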
on train end / optimizerなど学習状態も含めたstateを追加で保存する", + ) parser.add_argument("--resume", type=str, default=None, help="saved state to resume training / 学習再開するモデルのstate") parser.add_argument("--train_batch_size", type=int, default=1, help="batch size for training / 学習時のバッチサイズ") diff --git a/sdxl_train.py b/sdxl_train.py index e0df263d6..107bb9451 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -712,7 +712,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): accelerator.end_training() - if args.save_state: # and is_main_process: + if args.save_state or args.save_state_on_train_end: train_util.save_state_on_train_end(args, accelerator) del accelerator # この後メモリを使うのでこれは消す diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 1e5f92349..e99b4e35c 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -549,7 +549,7 @@ def remove_model(old_ckpt_name): accelerator.end_training() - if is_main_process and args.save_state: + if is_main_process and (args.save_state or args.save_state_on_train_end): train_util.save_state_on_train_end(args, accelerator) if is_main_process: diff --git a/train_controlnet.py b/train_controlnet.py index dc73a91c8..e44f08853 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -565,7 +565,7 @@ def remove_model(old_ckpt_name): accelerator.end_training() - if is_main_process and args.save_state: + if is_main_process and (args.save_state or args.save_state_on_train_end): train_util.save_state_on_train_end(args, accelerator) # del accelerator # この後メモリを使うのでこれは消す→printで使うので消さずにおく diff --git a/train_db.py b/train_db.py index 8d36097a5..41a9a7b99 100644 --- a/train_db.py +++ b/train_db.py @@ -444,7 +444,7 @@ def train(args): accelerator.end_training() - if args.save_state and is_main_process: + if is_main_process and (args.save_state or args.save_state_on_train_end): train_util.save_state_on_train_end(args, accelerator) del accelerator # この後メモリを使うのでこれは消す diff --git a/train_network.py b/train_network.py index e0fa69458..4707d5ae5 100644 --- a/train_network.py +++ b/train_network.py @@ -935,7 +935,7 @@ def remove_model(old_ckpt_name): accelerator.end_training() - if is_main_process and args.save_state: + if is_main_process and args.save_state or args.save_state_on_train_end: train_util.save_state_on_train_end(args, accelerator) if is_main_process: diff --git a/train_textual_inversion.py b/train_textual_inversion.py index df1d8485a..0266bc143 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -732,7 +732,7 @@ def remove_model(old_ckpt_name): accelerator.end_training() - if args.save_state and is_main_process: + if is_main_process and (args.save_state or args.save_state_on_train_end): train_util.save_state_on_train_end(args, accelerator) if is_main_process: diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 695fad2a8..ad7c267eb 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -586,7 +586,7 @@ def remove_model(old_ckpt_name): accelerator.end_training() - if args.save_state and is_main_process: + if is_main_process and (args.save_state or args.save_state_on_train_end): train_util.save_state_on_train_end(args, accelerator) updated_embs = text_encoder.get_input_embeddings().weight[token_ids_XTI].data.detach().clone() From d282c450026dcfd5f1fd5856f5087ebaed47be46 Mon Sep 17 00:00:00 2001 From: gesen2egee <79357052+gesen2egee@users.noreply.github.com> Date: Mon, 11 Mar 2024 23:56:09 +0800 Subject: [PATCH 019/132] Update 
train_network.py --- train_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 4707d5ae5..3db583f1d 100644 --- a/train_network.py +++ b/train_network.py @@ -935,7 +935,7 @@ def remove_model(old_ckpt_name): accelerator.end_training() - if is_main_process and args.save_state or args.save_state_on_train_end: + if is_main_process and (args.save_state or args.save_state_on_train_end): train_util.save_state_on_train_end(args, accelerator) if is_main_process: From fe1292d5d347b099250a2b349df304dc8c608705 Mon Sep 17 00:00:00 2001 From: kblueleaf Date: Tue, 12 Mar 2024 19:11:45 +0800 Subject: [PATCH 020/132] random ip_noise_gamma strength --- library/train_util.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index b71e4edc6..aa2d9b90b 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3100,6 +3100,13 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: help="enable input perturbation noise. used for regularization. recommended value: around 0.1 (from arxiv.org/abs/2301.11706) " + "/ input perturbation noiseを有効にする。正則化に使用される。推奨値: 0.1程度 (arxiv.org/abs/2301.11706 より)", ) + parser.add_argument( + "--ip_noise_gamma_random_strength", + type=bool, + default=False, + help="Use random strength between 0~ip_noise_gamma for input perturbation noise." + + "/ input perturbation noiseにおいて、0からip_noise_gammaの間でランダムな強度を使用します。", + ) # parser.add_argument( # "--perlin_noise", # type=int, @@ -4673,7 +4680,11 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents): # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) if args.ip_noise_gamma: - noisy_latents = noise_scheduler.add_noise(latents, noise + args.ip_noise_gamma * torch.randn_like(latents), timesteps) + if args.ip_noise_gamma_random_strength: + strength = torch.rand(1, device=latents.device) * args.ip_noise_gamma + else: + strength = args.ip_noise_gamma + noisy_latents = noise_scheduler.add_noise(latents, noise + strength * torch.randn_like(latents), timesteps) else: noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) From 922002eaf01256bf5d89b147274b2fcdf3189cf3 Mon Sep 17 00:00:00 2001 From: kblueleaf Date: Tue, 12 Mar 2024 19:14:01 +0800 Subject: [PATCH 021/132] random noise_offset strength --- library/train_util.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index aa2d9b90b..5282b5240 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3087,6 +3087,12 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: default=None, help="enable noise offset with this value (if enabled, around 0.1 is recommended) / Noise offsetを有効にしてこの値を設定する(有効にする場合は0.1程度を推奨)", ) + parser.add_argument( + "--noise_offset_random_strength", + type=bool, + default=False, + help="use random strength between 0~noise_offset for noise offset. 
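The parentheses added in patch 019 are not cosmetic: `and` binds tighter than `or`, so the unparenthesized condition is true on every rank once the new flag is set. A two-line check:

is_main_process = False
save_state, save_state_on_train_end = False, True

print(is_main_process and save_state or save_state_on_train_end)    # True: bug, every rank saves
print(is_main_process and (save_state or save_state_on_train_end))  # False: non-main ranks skip the save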
/ noise offsetにおいて、0からnoise_offsetの間でランダムな強度を使用します。", + ) parser.add_argument( "--multires_noise_iterations", type=int, @@ -4663,7 +4669,11 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents): # Sample noise that we'll add to the latents noise = torch.randn_like(latents, device=latents.device) if args.noise_offset: - noise = custom_train_functions.apply_noise_offset(latents, noise, args.noise_offset, args.adaptive_noise_scale) + if args.noise_offset_random_strength: + noise_offset = torch.rand(1, device=latents.device) * args.noise_offset + else: + noise_offset = args.noise_offset + noise = custom_train_functions.apply_noise_offset(latents, noise, noise_offset, args.adaptive_noise_scale) if args.multires_noise_iterations: noise = custom_train_functions.pyramid_noise_like( noise, latents.device, args.multires_noise_iterations, args.multires_noise_discount From 7221ba1c204cc819a3c2cc26bcb5f80c657af31f Mon Sep 17 00:00:00 2001 From: kblueleaf Date: Tue, 12 Mar 2024 19:24:27 +0800 Subject: [PATCH 022/132] use correct settings for parser --- library/train_util.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 5282b5240..73a768672 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3089,8 +3089,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: ) parser.add_argument( "--noise_offset_random_strength", - type=bool, - default=False, + action="store_true", help="use random strength between 0~noise_offset for noise offset. / noise offsetにおいて、0からnoise_offsetの間でランダムな強度を使用します。", ) parser.add_argument( @@ -3108,8 +3107,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: ) parser.add_argument( "--ip_noise_gamma_random_strength", - type=bool, - default=False, + action="store_true", help="Use random strength between 0~ip_noise_gamma for input perturbation noise." 
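A self-contained sketch of the randomized noise-offset strength above (apply_noise_offset is inlined here per its usual implementation in custom_train_functions; values are arbitrary):

import torch

latents = torch.randn(2, 4, 64, 64)
noise = torch.randn_like(latents)
max_noise_offset = 0.1  # stand-in for args.noise_offset

# The strength is re-drawn every step, uniform in [0, max_noise_offset).
noise_offset = torch.rand(1, device=latents.device) * max_noise_offset
# apply_noise_offset shifts each (sample, channel) mean by a random amount,
# which encourages the model to learn overall brightness changes.
noise = noise + noise_offset * torch.randn(latents.shape[0], latents.shape[1], 1, 1, device=latents.device)
print(noise.shape)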
+ "/ input perturbation noiseにおいて、0からip_noise_gammaの間でランダムな強度を使用します。", ) From 3511bcd109c233e3f8cab55138a640651051f9c9 Mon Sep 17 00:00:00 2001 From: kblueleaf Date: Wed, 13 Mar 2024 18:14:56 +0800 Subject: [PATCH 023/132] support meta cached dataset --- library/train_util.py | 7 +++++-- train_network.py | 23 +++++++++++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 73a768672..6c09c0a18 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -63,6 +63,7 @@ from huggingface_hub import hf_hub_download import numpy as np from PIL import Image +import imagesize import cv2 import safetensors.torch from library.lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline @@ -1073,8 +1074,7 @@ def cache_text_encoder_outputs( ) def get_image_size(self, image_path): - image = Image.open(image_path) - return image.size + return imagesize.get(image_path) def load_image_with_face_info(self, subset: BaseSubset, image_path: str): img = load_image(image_path) @@ -3314,6 +3314,9 @@ def add_dataset_arguments( parser: argparse.ArgumentParser, support_dreambooth: bool, support_caption: bool, support_caption_dropout: bool ): # dataset common + parser.add_argument( + "--dataset_from_pkl", action="store_true" + ) parser.add_argument( "--train_data_dir", type=str, default=None, help="directory for train images / 学習画像データのディレクトリ" ) diff --git a/train_network.py b/train_network.py index e5b26d8a2..c204b4656 100644 --- a/train_network.py +++ b/train_network.py @@ -6,6 +6,7 @@ import random import time import json +import pickle from multiprocessing import Value import toml @@ -23,7 +24,7 @@ import library.train_util as train_util from library.train_util import ( - DreamBoothDataset, + DreamBoothDataset, DatasetGroup ) import library.config_util as config_util from library.config_util import ( @@ -156,7 +157,25 @@ def train(self, args): tokenizers = tokenizer if isinstance(tokenizer, list) else [tokenizer] # データセットを準備する - if args.dataset_class is None: + if args.dataset_from_pkl: + logger.info(f"Loading dataset from cached meta") + with open(f"{args.train_data_dir}/dataset-meta.pkl", "rb") as f: + train_dataset_group = pickle.load(f) + assert isinstance(train_dataset_group, DatasetGroup) + logger.info(f"Dataset Loaded") + logger.info(f"Dataset have {train_dataset_group.num_train_images} images") + logger.info(f"Dataset have {train_dataset_group.num_reg_images} reg images") + + # To simulate the correct behavior of random operations + # To avoid any potential to cause "seed breaking changes" + dataset_seed = random.randint(0, 2**31) + for dataset in train_dataset_group.datasets: + dataset.tokenizers = tokenizers + dataset.tokenizer_max_length = dataset.tokenizers[0].model_max_length if args.max_token_length is None else args.max_token_length + 2 + dataset.set_seed(0) + dataset.shuffle_buckets() + dataset.set_seed(dataset_seed) + elif args.dataset_class is None: blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, False, True)) if use_user_config: logger.info(f"Loading dataset config from {args.dataset_config}") From 46f511ea598d1ab7c4d25f66c064fe18fcd8fa7f Mon Sep 17 00:00:00 2001 From: kblueleaf Date: Wed, 13 Mar 2024 18:15:17 +0800 Subject: [PATCH 024/132] add cache meta scripts --- cache_dataset_meta.py | 105 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 cache_dataset_meta.py diff --git a/cache_dataset_meta.py b/cache_dataset_meta.py new file mode 
100644 index 000000000..6101d9394 --- /dev/null +++ b/cache_dataset_meta.py @@ -0,0 +1,105 @@ +import argparse +import random +import pickle + +from accelerate.utils import set_seed + +import library.train_util as train_util +import library.config_util as config_util +from library.config_util import ( + ConfigSanitizer, + BlueprintGenerator, +) +import library.custom_train_functions as custom_train_functions +from library.utils import setup_logging, add_logging_arguments + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +def make_dataset(args): + train_util.prepare_dataset_args(args, True) + setup_logging(args, reset=True) + + use_dreambooth_method = args.in_json is None + use_user_config = args.dataset_config is not None + + if args.seed is None: + args.seed = random.randint(0, 2**32) + set_seed(args.seed) + + # データセットを準備する + if args.dataset_class is None: + blueprint_generator = BlueprintGenerator( + ConfigSanitizer(True, True, False, True) + ) + if use_user_config: + logger.info(f"Loading dataset config from {args.dataset_config}") + user_config = config_util.load_user_config(args.dataset_config) + ignored = ["train_data_dir", "reg_data_dir", "in_json"] + if any(getattr(args, attr) is not None for attr in ignored): + logger.warning( + "ignoring the following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( + ", ".join(ignored) + ) + ) + else: + if use_dreambooth_method: + logger.info("Using DreamBooth method.") + user_config = { + "datasets": [ + { + "subsets": config_util.generate_dreambooth_subsets_config_by_subdirs( + args.train_data_dir, args.reg_data_dir + ) + } + ] + } + else: + logger.info("Training with captions.") + user_config = { + "datasets": [ + { + "subsets": [ + { + "image_dir": args.train_data_dir, + "metadata_file": args.in_json, + } + ] + } + ] + } + + blueprint = blueprint_generator.generate(user_config, args, tokenizer=None) + train_dataset_group = config_util.generate_dataset_group_by_blueprint( + blueprint.dataset_group + ) + else: + # use arbitrary dataset class + train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizer=None) + return train_dataset_group + + +def setup_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + add_logging_arguments(parser) + train_util.add_dataset_arguments(parser, True, True, True) + train_util.add_training_arguments(parser, True) + config_util.add_config_arguments(parser) + custom_train_functions.add_custom_train_arguments(parser) + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args, unknown = parser.parse_known_args() + args = train_util.read_config_from_file(args, parser) + if args.max_token_length is None: + args.max_token_length = 75 + + dataset_group = make_dataset(args) + with open(f"{args.train_data_dir}/dataset-meta.pkl", "wb") as f: + pickle.dump(dataset_group, f) From 948029fe61d9142f88374d6701223bf9f7ee5d47 Mon Sep 17 00:00:00 2001 From: kblueleaf Date: Tue, 12 Mar 2024 19:11:45 +0800 Subject: [PATCH 025/132] random ip_noise_gamma strength --- library/train_util.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index b71e4edc6..aa2d9b90b 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3100,6 +3100,13 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: help="enable input perturbation noise. used for regularization. 
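And the consumer side of the pickle produced by cache_dataset_meta.py above, as train_network.py loads it under --dataset_from_pkl (path hypothetical; unpickling needs the same library code that wrote the file):

import pickle

train_data_dir = "/path/to/train_data"  # hypothetical
with open(f"{train_data_dir}/dataset-meta.pkl", "rb") as f:
    train_dataset_group = pickle.load(f)

print(train_dataset_group.num_train_images)  # restored without re-scanning images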
recommended value: around 0.1 (from arxiv.org/abs/2301.11706) " + "/ input perturbation noiseを有効にする。正則化に使用される。推奨値: 0.1程度 (arxiv.org/abs/2301.11706 より)", ) + parser.add_argument( + "--ip_noise_gamma_random_strength", + type=bool, + default=False, + help="Use random strength between 0~ip_noise_gamma for input perturbation noise." + + "/ input perturbation noiseにおいて、0からip_noise_gammaの間でランダムな強度を使用します。", + ) # parser.add_argument( # "--perlin_noise", # type=int, @@ -4673,7 +4680,11 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents): # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) if args.ip_noise_gamma: - noisy_latents = noise_scheduler.add_noise(latents, noise + args.ip_noise_gamma * torch.randn_like(latents), timesteps) + if args.ip_noise_gamma_random_strength: + strength = torch.rand(1, device=latents.device) * args.ip_noise_gamma + else: + strength = args.ip_noise_gamma + noisy_latents = noise_scheduler.add_noise(latents, noise + strength * torch.randn_like(latents), timesteps) else: noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) From 86399407b2bb5a93d691846acfa88e7ba38ae70d Mon Sep 17 00:00:00 2001 From: kblueleaf Date: Tue, 12 Mar 2024 19:14:01 +0800 Subject: [PATCH 026/132] random noise_offset strength --- library/train_util.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index aa2d9b90b..5282b5240 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3087,6 +3087,12 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: default=None, help="enable noise offset with this value (if enabled, around 0.1 is recommended) / Noise offsetを有効にしてこの値を設定する(有効にする場合は0.1程度を推奨)", ) + parser.add_argument( + "--noise_offset_random_strength", + type=bool, + default=False, + help="use random strength between 0~noise_offset for noise offset. 
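The hunk above is the whole mechanism: with `--ip_noise_gamma_random_strength`, the perturbation strength is drawn per step instead of being fixed at `ip_noise_gamma`. A self-contained sketch of the logic (`perturb_noise` is an illustrative name, not a function in the codebase):

```python
import torch

def perturb_noise(noise, latents, ip_noise_gamma, random_strength=False):
    # input perturbation (arxiv.org/abs/2301.11706): add a second, smaller
    # noise sample on top of the noise used for the forward diffusion step
    if random_strength:
        strength = torch.rand(1, device=latents.device) * ip_noise_gamma  # U[0, gamma)
    else:
        strength = ip_noise_gamma
    return noise + strength * torch.randn_like(latents)

latents = torch.randn(2, 4, 64, 64)
noise = torch.randn_like(latents)
perturbed = perturb_noise(noise, latents, ip_noise_gamma=0.1, random_strength=True)
```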
/ noise offsetにおいて、0からnoise_offsetの間でランダムな強度を使用します。", + ) parser.add_argument( "--multires_noise_iterations", type=int, @@ -4663,7 +4669,11 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents): # Sample noise that we'll add to the latents noise = torch.randn_like(latents, device=latents.device) if args.noise_offset: - noise = custom_train_functions.apply_noise_offset(latents, noise, args.noise_offset, args.adaptive_noise_scale) + if args.noise_offset_random_strength: + noise_offset = torch.rand(1, device=latents.device) * args.noise_offset + else: + noise_offset = args.noise_offset + noise = custom_train_functions.apply_noise_offset(latents, noise, noise_offset, args.adaptive_noise_scale) if args.multires_noise_iterations: noise = custom_train_functions.pyramid_noise_like( noise, latents.device, args.multires_noise_iterations, args.multires_noise_discount From 53954a1e2e05648bae6eb479720402968029cd3d Mon Sep 17 00:00:00 2001 From: kblueleaf Date: Tue, 12 Mar 2024 19:24:27 +0800 Subject: [PATCH 027/132] use correct settings for parser --- library/train_util.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 5282b5240..73a768672 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3089,8 +3089,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: ) parser.add_argument( "--noise_offset_random_strength", - type=bool, - default=False, + action="store_true", help="use random strength between 0~noise_offset for noise offset. / noise offsetにおいて、0からnoise_offsetの間でランダムな強度を使用します。", ) parser.add_argument( @@ -3108,8 +3107,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: ) parser.add_argument( "--ip_noise_gamma_random_strength", - type=bool, - default=False, + action="store_true", help="Use random strength between 0~ip_noise_gamma for input perturbation noise." 
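The fix in [PATCH 027/132] is worth spelling out: `type=bool` in argparse calls `bool()` on the raw string, and any non-empty string, including `"False"`, is truthy, so the flag could never be turned off from the command line. `action="store_true"` is the correct way to declare such a flag:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--broken", type=bool, default=False)  # the pattern the patch removes
parser.add_argument("--fixed", action="store_true")        # the replacement

args = parser.parse_args(["--broken", "False"])
print(args.broken)  # True, because bool("False") is True

args = parser.parse_args([])
print(args.fixed)   # False unless --fixed is passed on the command line
```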
+ "/ input perturbation noiseにおいて、0からip_noise_gammaの間でランダムな強度を使用します。", ) From 7081a0cf0f1ca1a543edf7cab10c4c7d497348ca Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 17 Mar 2024 18:09:15 +0900 Subject: [PATCH 028/132] extension of src image could be different than target image --- library/train_util.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 7fe5bc56e..0f8cf9eea 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1863,7 +1863,7 @@ def __init__( # assert all conditioning data exists missing_imgs = [] - cond_imgs_with_img = set() + cond_imgs_with_pair = set() for image_key, info in self.dreambooth_dataset_delegate.image_data.items(): db_subset = self.dreambooth_dataset_delegate.image_to_subset[image_key] subset = None @@ -1877,23 +1877,29 @@ def __init__( logger.warning(f"not directory: {subset.conditioning_data_dir}") continue - img_basename = os.path.basename(info.absolute_path) - ctrl_img_path = os.path.join(subset.conditioning_data_dir, img_basename) - if not os.path.exists(ctrl_img_path): + img_basename = os.path.splitext(os.path.basename(info.absolute_path))[0] + ctrl_img_path = glob_images(subset.conditioning_data_dir, img_basename) + if len(ctrl_img_path) < 1: missing_imgs.append(img_basename) + continue + ctrl_img_path = ctrl_img_path[0] + ctrl_img_path = os.path.abspath(ctrl_img_path) # normalize path info.cond_img_path = ctrl_img_path - cond_imgs_with_img.add(ctrl_img_path) + cond_imgs_with_pair.add(os.path.splitext(ctrl_img_path)[0]) # remove extension because Windows is case insensitive extra_imgs = [] for subset in subsets: conditioning_img_paths = glob_images(subset.conditioning_data_dir, "*") - extra_imgs.extend( - [cond_img_path for cond_img_path in conditioning_img_paths if cond_img_path not in cond_imgs_with_img] - ) + conditioning_img_paths = [os.path.abspath(p) for p in conditioning_img_paths] # normalize path + extra_imgs.extend([p for p in conditioning_img_paths if os.path.splitext(p)[0] not in cond_imgs_with_pair]) - assert len(missing_imgs) == 0, f"missing conditioning data for {len(missing_imgs)} images: {missing_imgs}" - assert len(extra_imgs) == 0, f"extra conditioning data for {len(extra_imgs)} images: {extra_imgs}" + assert ( + len(missing_imgs) == 0 + ), f"missing conditioning data for {len(missing_imgs)} images / 制御用画像が見つかりませんでした: {missing_imgs}" + assert ( + len(extra_imgs) == 0 + ), f"extra conditioning data for {len(extra_imgs)} images / 余分な制御用画像があります: {extra_imgs}" self.conditioning_image_transforms = IMAGE_TRANSFORMS From 3419c3de0d0ff8cba1d74444ece23608614f3c5b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 17 Mar 2024 19:30:20 +0900 Subject: [PATCH 029/132] common masked loss func, apply to all training script --- docs/train_lllite_README-ja.md | 8 ++++++-- docs/train_lllite_README.md | 4 +++- library/config_util.py | 5 ++++- library/custom_train_functions.py | 24 ++++++++++++++++++++---- library/train_util.py | 16 ++++++++++++++++ sdxl_train.py | 21 ++++----------------- train_db.py | 7 ++++++- train_network.py | 17 +++-------------- train_textual_inversion.py | 7 ++++++- train_textual_inversion_XTI.py | 7 ++++++- 10 files changed, 74 insertions(+), 42 deletions(-) diff --git a/docs/train_lllite_README-ja.md b/docs/train_lllite_README-ja.md index dbdc1fea2..1f6a78d5c 100644 --- a/docs/train_lllite_README-ja.md +++ b/docs/train_lllite_README-ja.md @@ -21,9 +21,13 @@ ComfyUIのカスタムノードを用意しています。: https://github.com/k ## モデルの学習 ### 
データセットの準備 -通常のdatasetに加え、`conditioning_data_dir` で指定したディレクトリにconditioning imageを格納してください。conditioning imageは学習用画像と同じbasenameを持つ必要があります。また、conditioning imageは学習用画像と同じサイズに自動的にリサイズされます。conditioning imageにはキャプションファイルは不要です。 +DreamBooth 方式の dataset で、`conditioning_data_dir` で指定したディレクトリにconditioning imageを格納してください。 -たとえば DreamBooth 方式でキャプションファイルを用いる場合の設定ファイルは以下のようになります。 +(finetuning 方式の dataset はサポートしていません。) + +conditioning imageは学習用画像と同じbasenameを持つ必要があります。また、conditioning imageは学習用画像と同じサイズに自動的にリサイズされます。conditioning imageにはキャプションファイルは不要です。 + +たとえば、キャプションにフォルダ名ではなくキャプションファイルを用いる場合の設定ファイルは以下のようになります。 ```toml [[datasets.subsets]] diff --git a/docs/train_lllite_README.md b/docs/train_lllite_README.md index 04dc12da2..a05f87f5f 100644 --- a/docs/train_lllite_README.md +++ b/docs/train_lllite_README.md @@ -26,7 +26,9 @@ Due to the limitations of the inference environment, only CrossAttention (attn1 ### Preparing the dataset -In addition to the normal dataset, please store the conditioning image in the directory specified by `conditioning_data_dir`. The conditioning image must have the same basename as the training image. The conditioning image will be automatically resized to the same size as the training image. The conditioning image does not require a caption file. +In addition to the normal DreamBooth method dataset, please store the conditioning image in the directory specified by `conditioning_data_dir`. The conditioning image must have the same basename as the training image. The conditioning image will be automatically resized to the same size as the training image. The conditioning image does not require a caption file. + +(We do not support the finetuning method dataset.) ```toml [[datasets.subsets]] diff --git a/library/config_util.py b/library/config_util.py index edc6a5385..26daeb472 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -323,7 +323,10 @@ def validate_flex_dataset(dataset_config: dict): self.dataset_schema = validate_flex_dataset elif support_dreambooth: - self.dataset_schema = self.db_dataset_schema + if support_controlnet: + self.dataset_schema = self.cn_dataset_schema + else: + self.dataset_schema = self.db_dataset_schema elif support_finetuning: self.dataset_schema = self.ft_dataset_schema elif support_controlnet: diff --git a/library/custom_train_functions.py b/library/custom_train_functions.py index a56474622..406e0e36e 100644 --- a/library/custom_train_functions.py +++ b/library/custom_train_functions.py @@ -3,11 +3,14 @@ import random import re from typing import List, Optional, Union -from .utils import setup_logging +from .utils import setup_logging + setup_logging() -import logging +import logging + logger = logging.getLogger(__name__) + def prepare_scheduler_for_custom_training(noise_scheduler, device): if hasattr(noise_scheduler, "all_snr"): return @@ -64,7 +67,7 @@ def apply_snr_weight(loss, timesteps, noise_scheduler, gamma, v_prediction=False snr = torch.stack([noise_scheduler.all_snr[t] for t in timesteps]) min_snr_gamma = torch.minimum(snr, torch.full_like(snr, gamma)) if v_prediction: - snr_weight = torch.div(min_snr_gamma, snr+1).float().to(loss.device) + snr_weight = torch.div(min_snr_gamma, snr + 1).float().to(loss.device) else: snr_weight = torch.div(min_snr_gamma, snr).float().to(loss.device) loss = loss * snr_weight @@ -92,13 +95,15 @@ def add_v_prediction_like_loss(loss, timesteps, noise_scheduler, v_pred_like_los loss = loss + loss / scale * v_pred_like_loss return loss + def apply_debiased_estimation(loss, timesteps, noise_scheduler): 
snr_t = torch.stack([noise_scheduler.all_snr[t] for t in timesteps]) # batch_size snr_t = torch.minimum(snr_t, torch.ones_like(snr_t) * 1000) # if timestep is 0, snr_t is inf, so limit it to 1000 - weight = 1/torch.sqrt(snr_t) + weight = 1 / torch.sqrt(snr_t) loss = weight * loss return loss + # TODO train_utilと分散しているのでどちらかに寄せる @@ -474,6 +479,17 @@ def apply_noise_offset(latents, noise, noise_offset, adaptive_noise_scale): return noise +def apply_masked_loss(loss, batch): + # mask image is -1 to 1. we need to convert it to 0 to 1 + mask_image = batch["conditioning_images"].to(dtype=loss.dtype)[:, 0].unsqueeze(1) # use R channel + + # resize to the same size as the loss + mask_image = torch.nn.functional.interpolate(mask_image, size=loss.shape[2:], mode="area") + mask_image = mask_image / 2 + 0.5 + loss = loss * mask_image + return loss + + """ ########################################## # Perlin Noise diff --git a/library/train_util.py b/library/train_util.py index 0f8cf9eea..1d9f8bf82 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3028,6 +3028,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: "--full_bf16", action="store_true", help="bf16 training including gradients / 勾配も含めてbf16で学習する" ) # TODO move to SDXL training, because it is not supported by SD1/2 parser.add_argument("--fp8_base", action="store_true", help="use fp8 for base model / base modelにfp8を使う") + parser.add_argument( "--ddp_timeout", type=int, @@ -3090,6 +3091,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: default=None, help="specify WandB API key to log in before starting training (optional). / WandB APIキーを指定して学習開始前にログインする(オプション)", ) + parser.add_argument( "--noise_offset", type=float, @@ -3252,6 +3254,20 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: ) +def add_masked_loss_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "--conditioning_data_dir", + type=str, + default=None, + help="conditioning data directory / 条件付けデータのディレクトリ", + ) + parser.add_argument( + "--masked_loss", + action="store_true", + help="apply mask for calculating loss. conditioning_data_dir is required for dataset. / 損失計算時にマスクを適用する。datasetにはconditioning_data_dirが必要", + ) + + def verify_training_args(args: argparse.Namespace): r""" Verify training arguments. Also reflect highvram option to global variable diff --git a/sdxl_train.py b/sdxl_train.py index 448a160f6..f8aa46081 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -40,6 +40,7 @@ scale_v_prediction_loss_like_noise_prediction, add_v_prediction_like_loss, apply_debiased_estimation, + apply_masked_loss, ) from library.sdxl_original_unet import SdxlUNet2DConditionModel @@ -577,19 +578,12 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): or args.scale_v_pred_loss_like_noise_pred or args.v_pred_like_loss or args.debiased_estimation_loss + or args.masked_loss ): # do not mean over batch dimension for snr weight or scale v-pred loss loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") - if args.masked_loss: - # mask image is -1 to 1. 
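The new `apply_masked_loss` above is small enough to exercise with toy tensors: the mask arrives in the `-1..1` range used for conditioning images, its R channel is downscaled to the latent resolution of the per-pixel loss, and rescaled to `0..1` before multiplication.

```python
import torch

def apply_masked_loss(loss, batch):
    # copy of the helper above: R channel, resize to the loss resolution, rescale to 0..1
    mask_image = batch["conditioning_images"].to(dtype=loss.dtype)[:, 0].unsqueeze(1)
    mask_image = torch.nn.functional.interpolate(mask_image, size=loss.shape[2:], mode="area")
    mask_image = mask_image / 2 + 0.5
    return loss * mask_image

batch = {"conditioning_images": torch.rand(2, 3, 512, 512) * 2 - 1}  # toy masks in -1..1
loss = torch.rand(2, 4, 64, 64)                                      # toy latent-space loss
masked = apply_masked_loss(loss, batch)
per_sample = masked.mean([1, 2, 3])  # the training scripts reduce per sample next
```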
we need to convert it to 0 to 1 - mask_image = batch["conditioning_images"].to(dtype=weight_dtype)[:, 0].unsqueeze(1) # use R channel - - # resize to the same size as the loss - mask_image = torch.nn.functional.interpolate(mask_image, size=loss.shape[2:], mode="area") - mask_image = mask_image / 2 + 0.5 - loss = loss * mask_image - + loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) if args.min_snr_gamma: @@ -755,6 +749,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, True, True) train_util.add_training_arguments(parser, False) + train_util.add_masked_loss_arguments(parser) train_util.add_sd_saving_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) @@ -790,14 +785,6 @@ def setup_parser() -> argparse.ArgumentParser: help=f"learning rates for each block of U-Net, comma-separated, {UNET_NUM_BLOCKS_FOR_BLOCK_LR} values / " + f"U-Netの各ブロックの学習率、カンマ区切り、{UNET_NUM_BLOCKS_FOR_BLOCK_LR}個の値", ) - - # TODO common masked_loss argument - parser.add_argument( - "--masked_loss", - action="store_true", - help="apply mask for calculating loss. conditioning_data_dir is required for dataset. / 損失計算時にマスクを適用する。datasetにはconditioning_data_dirが必要", - ) - return parser diff --git a/train_db.py b/train_db.py index 8d36097a5..213df1516 100644 --- a/train_db.py +++ b/train_db.py @@ -12,6 +12,7 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from accelerate.utils import set_seed @@ -32,6 +33,7 @@ apply_noise_offset, scale_v_prediction_loss_like_noise_prediction, apply_debiased_estimation, + apply_masked_loss, ) from library.utils import setup_logging, add_logging_arguments @@ -57,7 +59,7 @@ def train(args): # データセットを準備する if args.dataset_class is None: - blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, False, False, True)) + blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, False, args.masked_loss, True)) if args.dataset_config is not None: logger.info(f"Load dataset config from {args.dataset_config}") user_config = config_util.load_user_config(args.dataset_config) @@ -339,6 +341,8 @@ def train(args): target = noise loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + if args.masked_loss: + loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight @@ -464,6 +468,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, False, True) train_util.add_training_arguments(parser, True) + train_util.add_masked_loss_arguments(parser) train_util.add_sd_saving_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) diff --git a/train_network.py b/train_network.py index f5617986c..05522070b 100644 --- a/train_network.py +++ b/train_network.py @@ -40,6 +40,7 @@ scale_v_prediction_loss_like_noise_prediction, add_v_prediction_like_loss, apply_debiased_estimation, + apply_masked_loss, ) from library.utils import setup_logging, add_logging_arguments @@ -835,16 +836,8 @@ def remove_model(old_ckpt_name): target = noise loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") - if args.masked_loss: - # mask image is -1 to 1. 
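All of the training scripts touched by this patch share the same reduction order, which is why the helper can be dropped in uniformly: compute an unreduced MSE, apply the mask while the loss is still per-pixel, average per sample, then apply per-sample weights. In outline, with toy tensors:

```python
import torch
import torch.nn.functional as F

noise_pred = torch.randn(4, 4, 64, 64)
target = torch.randn_like(noise_pred)
loss_weights = torch.tensor([1.0, 1.0, 0.5, 0.5])  # e.g. prior_loss_weight for reg images

loss = F.mse_loss(noise_pred.float(), target.float(), reduction="none")
# apply_masked_loss(loss, batch) would be applied here, while the loss is per-pixel
loss = loss.mean([1, 2, 3])          # one value per sample
loss = (loss * loss_weights).mean()  # scalar passed to accelerator.backward()
```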
we need to convert it to 0 to 1 - mask_image = batch["conditioning_images"].to(dtype=weight_dtype)[:, 0].unsqueeze(1) # use R channel - - # resize to the same size as the loss - mask_image = torch.nn.functional.interpolate(mask_image, size=loss.shape[2:], mode="area") - mask_image = mask_image / 2 + 0.5 - loss = loss * mask_image - + loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight @@ -968,6 +961,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, True, True) train_util.add_training_arguments(parser, True) + train_util.add_masked_loss_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser) @@ -1061,11 +1055,6 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", ) - parser.add_argument( - "--masked_loss", - action="store_true", - help="apply mask for calculating loss. conditioning_data_dir is required for dataset. / 損失計算時にマスクを適用する。datasetにはconditioning_data_dirが必要", - ) return parser diff --git a/train_textual_inversion.py b/train_textual_inversion.py index df1d8485a..7697b9672 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -8,6 +8,7 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from accelerate.utils import set_seed @@ -29,6 +30,7 @@ scale_v_prediction_loss_like_noise_prediction, add_v_prediction_like_loss, apply_debiased_estimation, + apply_masked_loss, ) from library.utils import setup_logging, add_logging_arguments @@ -268,7 +270,7 @@ def train(self, args): # データセットを準備する if args.dataset_class is None: - blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, False, False)) + blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, False)) if args.dataset_config is not None: accelerator.print(f"Load dataset config from {args.dataset_config}") user_config = config_util.load_user_config(args.dataset_config) @@ -586,6 +588,8 @@ def remove_model(old_ckpt_name): target = noise loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + if args.masked_loss: + loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight @@ -749,6 +753,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, True, False) train_util.add_training_arguments(parser, True) + train_util.add_masked_loss_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser, False) diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 695fad2a8..72b79da46 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -9,6 +9,7 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from accelerate.utils import set_seed @@ -31,6 +32,7 @@ apply_noise_offset, scale_v_prediction_loss_like_noise_prediction, apply_debiased_estimation, + apply_masked_loss, ) import library.original_unet as original_unet from XTI_hijack import unet_forward_XTI, 
downblock_forward_XTI, upblock_forward_XTI @@ -200,7 +202,7 @@ def train(args): logger.info(f"create embeddings for {args.num_vectors_per_token} tokens, for {args.token_string}") # データセットを準備する - blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, False, False)) + blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, False)) if args.dataset_config is not None: logger.info(f"Load dataset config from {args.dataset_config}") user_config = config_util.load_user_config(args.dataset_config) @@ -471,6 +473,8 @@ def remove_model(old_ckpt_name): target = noise loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none") + if args.masked_loss: + loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight @@ -662,6 +666,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, True, False) train_util.add_training_arguments(parser, True) + train_util.add_masked_loss_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser, False) From f9317052edb4ab3b3c531ac3b28825ee78b4a966 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 18 Mar 2024 08:53:23 +0900 Subject: [PATCH 030/132] update readme for timestep embs bug --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3639b7be8..001446b7c 100644 --- a/README.md +++ b/README.md @@ -252,12 +252,15 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum ### Mar 15, 2024 / 2024/3/15: v0.8.5 - Fixed a bug that the value of timestep embedding during SDXL training was incorrect. + - Please update for SDXL training. - The inference with the generation script is also fixed. - - The impact is unknown, but please update for SDXL training. + - This fix appears to resolve an issue where unintended artifacts occurred in trained models under certain conditions. +We would like to express our deep gratitude to Mark Saint (cacoe) from leonardo.ai, for reporting the issue and cooperating with the verification, and to gcem156 for the advice provided in identifying the part of the code that needed to be fixed. - SDXL 学習時の timestep embedding の値が誤っていたのを修正しました。 + - SDXL の学習時にはアップデートをお願いいたします。 - 生成スクリプトでの推論時についてもあわせて修正しました。 - - 影響の度合いは不明ですが、SDXL の学習時にはアップデートをお願いいたします。 + - この修正により、特定の条件下で学習されたモデルに意図しないアーティファクトが発生する問題が解消されるようです。問題を報告いただき、また検証にご協力いただいた leonardo.ai の Mark Saint (cacoe) 氏、および修正点の特定に関するアドバイスをいただいた gcem156 氏に深く感謝いたします。 ### Feb 24, 2024 / 2024/2/24: v0.8.4 From a7dff592d34a5dd9d306de822db70f0028676cab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?= Date: Mon, 18 Mar 2024 22:29:05 +0800 Subject: [PATCH 031/132] Update tag_images_by_wd14_tagger.py add WDV3 --- finetune/tag_images_by_wd14_tagger.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/finetune/tag_images_by_wd14_tagger.py b/finetune/tag_images_by_wd14_tagger.py index b56d921a3..e63ec3eb4 100644 --- a/finetune/tag_images_by_wd14_tagger.py +++ b/finetune/tag_images_by_wd14_tagger.py @@ -86,23 +86,26 @@ def main(args): logger.info(f"downloading wd14 tagger model from hf_hub. 
id: {args.repo_id}") files = FILES if args.onnx: + files = ["selected_tags.csv"] files += FILES_ONNX + else: + for file in SUB_DIR_FILES: + hf_hub_download( + args.repo_id, + file, + subfolder=SUB_DIR, + cache_dir=os.path.join(args.model_dir, SUB_DIR), + force_download=True, + force_filename=file, + ) for file in files: hf_hub_download(args.repo_id, file, cache_dir=args.model_dir, force_download=True, force_filename=file) - for file in SUB_DIR_FILES: - hf_hub_download( - args.repo_id, - file, - subfolder=SUB_DIR, - cache_dir=os.path.join(args.model_dir, SUB_DIR), - force_download=True, - force_filename=file, - ) else: logger.info("using existing wd14 tagger model") # 画像を読み込む if args.onnx: + import torch import onnx import onnxruntime as ort From 5410a8c79b23c594bb340050b4a81e30d95cd7be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?= Date: Mon, 18 Mar 2024 22:31:00 +0800 Subject: [PATCH 032/132] Update requirements.txt --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 279de350c..326b65b3e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,9 +22,9 @@ huggingface-hub==0.20.1 # for WD14 captioning (tensorflow) # tensorflow==2.10.1 # for WD14 captioning (onnx) -# onnx==1.14.1 -# onnxruntime-gpu==1.16.0 -# onnxruntime==1.16.0 +# onnx==1.15.1 +# onnxruntime-gpu==1.17.1 +# onnxruntime==1.17.1 # this is for onnx: # protobuf==3.20.3 # open clip for SDXL From a71c35ccd9c813821fcbd3f0e00d71fb5e6d91d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?= Date: Mon, 18 Mar 2024 22:31:59 +0800 Subject: [PATCH 033/132] Update requirements.txt --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index 326b65b3e..6898eccf6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,6 +25,9 @@ huggingface-hub==0.20.1 # onnx==1.15.1 # onnxruntime-gpu==1.17.1 # onnxruntime==1.17.1 +# for cuda 12.1(default 11.8) +# onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ + # this is for onnx: # protobuf==3.20.3 # open clip for SDXL From 6c51c971d135a346d2f9081760f138b1c6515e9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?= Date: Wed, 20 Mar 2024 09:35:21 +0800 Subject: [PATCH 034/132] fix typo --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6898eccf6..805f0501d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,7 @@ huggingface-hub==0.20.1 # for WD14 captioning (tensorflow) # tensorflow==2.10.1 # for WD14 captioning (onnx) -# onnx==1.15.1 +# onnx==1.15.0 # onnxruntime-gpu==1.17.1 # onnxruntime==1.17.1 # for cuda 12.1(default 11.8) From 80dbbf5e4875f56ff1e0d8aacea4e73b96a14b63 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 20 Mar 2024 16:14:57 +0900 Subject: [PATCH 035/132] tagger now stores model under repo_id subdir --- README.md | 9 ++++- finetune/tag_images_by_wd14_tagger.py | 55 ++++++++++++++++++--------- 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index f0cad611f..d03204037 100644 --- a/README.md +++ b/README.md @@ -260,7 +260,9 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - `keep_tokens_separator` is updated to be used twice in the caption. 
When you specify `keep_tokens_separator="|||"`, the part divided by the second `|||` is not shuffled or dropped and remains at the end. - The existing features `caption_prefix` and `caption_suffix` can be used together. `caption_prefix` and `caption_suffix` are processed first, and then `enable_wildcard`, `keep_tokens_separator`, shuffling and dropping, and `secondary_separator` are processed in order. - The examples are [shown below](#example-of-dataset-settings--データセット設定の記述例). - +- The support for v3 repositories is added to `tag_image_by_wd14_tagger.py` (`--onnx` option only). PR [#1192](https://github.com/kohya-ss/sd-scripts/pull/1192) Thanks to sdbds! + - Onnx may need to be updated. Onnx is not installed by default, so please install or update it with `pip install onnx==1.15.0 onnxruntime-gpu==1.17.1` etc. Please also check the comments in `requirements.txt`. +- The model is now saved in the subdirectory as `--repo_id` in `tag_image_by_wd14_tagger.py` . This caches multiple repo_id models. Please delete unnecessary files under `--model_dir`. - Colab での動作時、ログ出力で停止してしまうようです。学習スクリプトに `--console_log_simple` オプションを指定し、rich のロギングを無効してお試しください。 - `train_network.py` および `sdxl_train_network.py` で、学習したモデルのメタデータに一部のデータセット設定が記録されるよう修正しました(`caption_prefix`、`caption_suffix`、`keep_tokens_separator`、`secondary_separator`、`enable_wildcard`)。 @@ -269,6 +271,11 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - `enable_wildcard` を追加しました。`true` にするとワイルドカード記法 `{aaa|bbb|ccc}` が使えます。詳しくは記述例をご覧ください。 - `keep_tokens_separator` をキャプション内に 2 つ使えるようにしました。たとえば `keep_tokens_separator="|||"` と指定したとき、`1girl, hatsune miku, vocaloid ||| stage, mic ||| best quality, rating: general` とキャプションを指定すると、二番目の `|||` で分割された部分はシャッフル、drop されず末尾に残ります。 - 既存の機能 `caption_prefix` と `caption_suffix` とあわせて使えます。`caption_prefix` と `caption_suffix` は一番最初に処理され、その後、ワイルドカード、`keep_tokens_separator`、シャッフルおよび drop、`secondary_separator` の順に処理されます。 +- `tag_image_by_wd14_tagger.py` で v3 のリポジトリがサポートされました(`--onnx` 指定時のみ有効)。 PR [#1192](https://github.com/kohya-ss/sd-scripts/pull/1192) sdbds 氏に感謝します。 + - Onnx のバージョンアップが必要になるかもしれません。デフォルトでは Onnx はインストールされていませんので、`pip install onnx==1.15.0 onnxruntime-gpu==1.17.1` 等でインストール、アップデートしてください。`requirements.txt` のコメントもあわせてご確認ください。 +- `tag_image_by_wd14_tagger.py` で、モデルを`--repo_id` のサブディレクトリに保存するようにしました。これにより複数のモデルファイルがキャッシュされます。`--model_dir` 直下の不要なファイルは削除願います。 + + #### Example of dataset settings / データセット設定の記述例: diff --git a/finetune/tag_images_by_wd14_tagger.py b/finetune/tag_images_by_wd14_tagger.py index e63ec3eb4..401c6d1ec 100644 --- a/finetune/tag_images_by_wd14_tagger.py +++ b/finetune/tag_images_by_wd14_tagger.py @@ -12,8 +12,10 @@ import library.train_util as train_util from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) # from wd14 tagger @@ -79,10 +81,15 @@ def collate_fn_remove_corrupted(batch): def main(args): + # model location is model_dir + repo_id + # repo id may be like "user/repo" or "user/repo/branch", so we need to remove slash + model_location = os.path.join(args.model_dir, args.repo_id.replace("/", "_")) + # hf_hub_downloadをそのまま使うとsymlink関係で問題があるらしいので、キャッシュディレクトリとforce_filenameを指定してなんとかする # depreacatedの警告が出るけどなくなったらその時 # https://github.com/toriato/stable-diffusion-webui-wd14-tagger/issues/22 - if not os.path.exists(args.model_dir) or args.force_download: + if not os.path.exists(model_location) or args.force_download: + os.makedirs(args.model_dir, exist_ok=True) logger.info(f"downloading wd14 tagger 
model from hf_hub. id: {args.repo_id}") files = FILES if args.onnx: @@ -94,12 +101,12 @@ def main(args): args.repo_id, file, subfolder=SUB_DIR, - cache_dir=os.path.join(args.model_dir, SUB_DIR), + cache_dir=os.path.join(model_location, SUB_DIR), force_download=True, force_filename=file, ) for file in files: - hf_hub_download(args.repo_id, file, cache_dir=args.model_dir, force_download=True, force_filename=file) + hf_hub_download(args.repo_id, file, cache_dir=model_location, force_download=True, force_filename=file) else: logger.info("using existing wd14 tagger model") @@ -109,7 +116,7 @@ def main(args): import onnx import onnxruntime as ort - onnx_path = f"{args.model_dir}/model.onnx" + onnx_path = f"{model_location}/model.onnx" logger.info("Running wd14 tagger with onnx") logger.info(f"loading onnx model: {onnx_path}") @@ -126,7 +133,7 @@ def main(args): except: batch_size = model.graph.input[0].type.tensor_type.shape.dim[0].dim_param - if args.batch_size != batch_size and type(batch_size) != str: + if args.batch_size != batch_size and type(batch_size) != str and batch_size > 0: # some rebatch model may use 'N' as dynamic axes logger.warning( f"Batch size {args.batch_size} doesn't match onnx model batch size {batch_size}, use model batch size {batch_size}" @@ -137,19 +144,19 @@ def main(args): ort_sess = ort.InferenceSession( onnx_path, - providers=["CUDAExecutionProvider"] - if "CUDAExecutionProvider" in ort.get_available_providers() - else ["CPUExecutionProvider"], + providers=( + ["CUDAExecutionProvider"] if "CUDAExecutionProvider" in ort.get_available_providers() else ["CPUExecutionProvider"] + ), ) else: from tensorflow.keras.models import load_model - model = load_model(f"{args.model_dir}") + model = load_model(f"{model_location}") # label_names = pd.read_csv("2022_0000_0899_6549/selected_tags.csv") # 依存ライブラリを増やしたくないので自力で読むよ - with open(os.path.join(args.model_dir, CSV_FILE), "r", encoding="utf-8") as f: + with open(os.path.join(model_location, CSV_FILE), "r", encoding="utf-8") as f: reader = csv.reader(f) l = [row for row in reader] header = l[0] # tag_id,name,category,count @@ -175,8 +182,8 @@ def run_batch(path_imgs): imgs = np.array([im for _, im in path_imgs]) if args.onnx: - if len(imgs) < args.batch_size: - imgs = np.concatenate([imgs, np.zeros((args.batch_size - len(imgs), IMAGE_SIZE, IMAGE_SIZE, 3))], axis=0) + # if len(imgs) < args.batch_size: + # imgs = np.concatenate([imgs, np.zeros((args.batch_size - len(imgs), IMAGE_SIZE, IMAGE_SIZE, 3))], axis=0) probs = ort_sess.run(None, {input_name: imgs})[0] # onnx output numpy probs = probs[: len(path_imgs)] else: @@ -317,7 +324,9 @@ def setup_parser() -> argparse.ArgumentParser: help="directory to store wd14 tagger model / wd14 taggerのモデルを格納するディレクトリ", ) parser.add_argument( - "--force_download", action="store_true", help="force downloading wd14 tagger models / wd14 taggerのモデルを再ダウンロードします" + "--force_download", + action="store_true", + help="force downloading wd14 tagger models / wd14 taggerのモデルを再ダウンロードします", ) parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ") parser.add_argument( @@ -332,8 +341,12 @@ def setup_parser() -> argparse.ArgumentParser: default=None, help="extension of caption file (for backward compatibility) / 出力されるキャプションファイルの拡張子(スペルミスしていたのを残してあります)", ) - parser.add_argument("--caption_extension", type=str, default=".txt", help="extension of caption file / 出力されるキャプションファイルの拡張子") - parser.add_argument("--thresh", type=float, default=0.35, help="threshold of confidence 
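Putting the tagger's ONNX inference path together: pick a provider, build one session, and feed NHWC float batches. A minimal sketch, assuming a `model.onnx` has already been downloaded to a local path (the script's image preprocessing to the model's input size, conventionally 448x448 for the wd14 taggers, is omitted here):

```python
import numpy as np
import onnxruntime as ort

onnx_path = "wd14_model/model.onnx"  # placeholder local path

# CUDA when onnxruntime-gpu is usable, otherwise CPU, as in the script above
providers = (
    ["CUDAExecutionProvider"]
    if "CUDAExecutionProvider" in ort.get_available_providers()
    else ["CPUExecutionProvider"]
)
sess = ort.InferenceSession(onnx_path, providers=providers)

input_name = sess.get_inputs()[0].name
imgs = np.zeros((1, 448, 448, 3), dtype=np.float32)  # one dummy NHWC image
probs = sess.run(None, {input_name: imgs})[0]        # one probability row per image
```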
to add a tag / タグを追加するか判定する閾値") + parser.add_argument( + "--caption_extension", type=str, default=".txt", help="extension of caption file / 出力されるキャプションファイルの拡張子" + ) + parser.add_argument( + "--thresh", type=float, default=0.35, help="threshold of confidence to add a tag / タグを追加するか判定する閾値" + ) parser.add_argument( "--general_threshold", type=float, @@ -346,7 +359,9 @@ def setup_parser() -> argparse.ArgumentParser: default=None, help="threshold of confidence to add a tag for character category, same as --thres if omitted / characterカテゴリのタグを追加するための確信度の閾値、省略時は --thresh と同じ", ) - parser.add_argument("--recursive", action="store_true", help="search for images in subfolders recursively / サブフォルダを再帰的に検索する") + parser.add_argument( + "--recursive", action="store_true", help="search for images in subfolders recursively / サブフォルダを再帰的に検索する" + ) parser.add_argument( "--remove_underscore", action="store_true", @@ -359,9 +374,13 @@ def setup_parser() -> argparse.ArgumentParser: default="", help="comma-separated list of undesired tags to remove from the output / 出力から除外したいタグのカンマ区切りのリスト", ) - parser.add_argument("--frequency_tags", action="store_true", help="Show frequency of tags for images / 画像ごとのタグの出現頻度を表示する") + parser.add_argument( + "--frequency_tags", action="store_true", help="Show frequency of tags for images / 画像ごとのタグの出現頻度を表示する" + ) parser.add_argument("--onnx", action="store_true", help="use onnx model for inference / onnxモデルを推論に使用する") - parser.add_argument("--append_tags", action="store_true", help="Append captions instead of overwriting / 上書きではなくキャプションを追記する") + parser.add_argument( + "--append_tags", action="store_true", help="Append captions instead of overwriting / 上書きではなくキャプションを追記する" + ) parser.add_argument( "--caption_separator", type=str, From 46331a9e8ef695ea0b5a19686202d011109a56b6 Mon Sep 17 00:00:00 2001 From: Victor Espinoza-Guerra Date: Wed, 20 Mar 2024 00:31:01 -0700 Subject: [PATCH 036/132] English Translation of config_README-ja.md (#1175) * Add files via upload Creating template to work on. * Update config_README-en.md Total Conversion from Japanese to English. * Update config_README-en.md * Update config_README-en.md * Update config_README-en.md --- docs/config_README-en.md | 279 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 279 insertions(+) create mode 100644 docs/config_README-en.md diff --git a/docs/config_README-en.md b/docs/config_README-en.md new file mode 100644 index 000000000..a0727934d --- /dev/null +++ b/docs/config_README-en.md @@ -0,0 +1,279 @@ +Original Source by kohya-ss + +A.I Translation by Model: NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO, editing by Darkstorm2150 + +# Config Readme + +This README is about the configuration files that can be passed with the `--dataset_config` option. + +## Overview + +By passing a configuration file, users can make detailed settings. + +* Multiple datasets can be configured + * For example, by setting `resolution` for each dataset, they can be mixed and trained. + * In training methods that support both the DreamBooth approach and the fine-tuning approach, datasets of the DreamBooth method and the fine-tuning method can be mixed. +* Settings can be changed for each subset + * A subset is a partition of the dataset by image directory or metadata. Several subsets make up a dataset. + * Options such as `keep_tokens` and `flip_aug` can be set for each subset. On the other hand, options such as `resolution` and `batch_size` can be set for each dataset, and their values are common among subsets belonging to the same dataset. 
More details will be provided later. + +The configuration file format can be JSON or TOML. Considering the ease of writing, it is recommended to use [TOML](https://toml.io/ja/v1.0.0-rc.2). The following explanation assumes the use of TOML. + + +Here is an example of a configuration file written in TOML. + +```toml +[general] +shuffle_caption = true +caption_extension = '.txt' +keep_tokens = 1 + +# This is a DreamBooth-style dataset +[[datasets]] +resolution = 512 +batch_size = 4 +keep_tokens = 2 + + [[datasets.subsets]] + image_dir = 'C:\hoge' + class_tokens = 'hoge girl' + # This subset uses keep_tokens = 2 (the value of the parent datasets) + + [[datasets.subsets]] + image_dir = 'C:\fuga' + class_tokens = 'fuga boy' + keep_tokens = 3 + + [[datasets.subsets]] + is_reg = true + image_dir = 'C:\reg' + class_tokens = 'human' + keep_tokens = 1 + +# This is a fine-tuning dataset +[[datasets]] +resolution = [768, 768] +batch_size = 2 + + [[datasets.subsets]] + image_dir = 'C:\piyo' + metadata_file = 'C:\piyo\piyo_md.json' + # This subset uses keep_tokens = 1 (the value of [general]) +``` + +In this example, three directories are trained as a DreamBooth-style dataset at 512x512 (batch size 4), and one directory is trained as a fine-tuning dataset at 768x768 (batch size 2). + +## Settings for datasets and subsets + +Settings for datasets and subsets are divided into several registration locations. + +* `[general]` + * This is where options that apply to all datasets or all subsets are specified. + * If there are options with the same name in the dataset-specific or subset-specific settings, the dataset-specific or subset-specific settings take precedence. +* `[[datasets]]` + * `datasets` is where settings for datasets are registered. This is where options that apply individually to each dataset are specified. + * If there are subset-specific settings, the subset-specific settings take precedence. +* `[[datasets.subsets]]` + * `datasets.subsets` is where settings for subsets are registered. This is where options that apply individually to each subset are specified. + +Here is an image showing the correspondence between image directories and registration locations in the previous example. + +``` +C:\ +├─ hoge -> [[datasets.subsets]] No.1 ┐ ┐ +├─ fuga -> [[datasets.subsets]] No.2 |-> [[datasets]] No.1 |-> [general] +├─ reg -> [[datasets.subsets]] No.3 ┘ | +└─ piyo -> [[datasets.subsets]] No.4 --> [[datasets]] No.2 ┘ +``` + +The image directory corresponds to each `[[datasets.subsets]]`. Then, multiple `[[datasets.subsets]]` are combined to form one `[[datasets]]`. All `[[datasets]]` and `[[datasets.subsets]]` belong to `[general]`. + +The available options for each registration location may differ, but if the same option is specified, the value in the lower registration location will take precedence. You can check how the `keep_tokens` option is handled in the previous example for better understanding. + +Additionally, the available options may vary depending on the method that the learning approach supports. + +* Options specific to the DreamBooth method +* Options specific to the fine-tuning method +* Options available when using the caption dropout technique + +When using both the DreamBooth method and the fine-tuning method, they can be used together with a learning approach that supports both. 
+When using them together, a point to note is that the method is determined based on the dataset, so it is not possible to mix DreamBooth method subsets and fine-tuning method subsets within the same dataset. +In other words, if you want to use both methods together, you need to set up subsets of different methods belonging to different datasets. + +In terms of program behavior, if the `metadata_file` option exists, it is determined to be a subset of fine-tuning. Therefore, for subsets belonging to the same dataset, as long as they are either "all have the `metadata_file` option" or "all have no `metadata_file` option," there is no problem. + +Below, the available options will be explained. For options with the same name as the command-line argument, the explanation will be omitted in principle. Please refer to other READMEs. + +### Common options for all learning methods + +These are options that can be specified regardless of the learning method. + +#### Data set specific options + +These are options related to the configuration of the data set. They cannot be described in `datasets.subsets`. + + +| Option Name | Example Setting | `[general]` | `[[datasets]]` | +| ---- | ---- | ---- | ---- | +| `batch_size` | `1` | o | o | +| `bucket_no_upscale` | `true` | o | o | +| `bucket_reso_steps` | `64` | o | o | +| `enable_bucket` | `true` | o | o | +| `max_bucket_reso` | `1024` | o | o | +| `min_bucket_reso` | `128` | o | o | +| `resolution` | `256`, `[512, 512]` | o | o | + +* `batch_size` + * This corresponds to the command-line argument `--train_batch_size`. + +These settings are fixed per dataset. That means that subsets belonging to the same dataset will share these settings. For example, if you want to prepare datasets with different resolutions, you can define them as separate datasets as shown in the example above, and set different resolutions for each. + +#### Options for Subsets + +These options are related to subset configuration. + +| Option Name | Example | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` | +| ---- | ---- | ---- | ---- | ---- | +| `color_aug` | `false` | o | o | o | +| `face_crop_aug_range` | `[1.0, 3.0]` | o | o | o | +| `flip_aug` | `true` | o | o | o | +| `keep_tokens` | `2` | o | o | o | +| `num_repeats` | `10` | o | o | o | +| `random_crop` | `false` | o | o | o | +| `shuffle_caption` | `true` | o | o | o | +| `caption_prefix` | `"masterpiece, best quality, "` | o | o | o | +| `caption_suffix` | `", from side"` | o | o | o | + +* `num_repeats` + * Specifies the number of repeats for images in a subset. This is equivalent to `--dataset_repeats` in fine-tuning but can be specified for any training method. +* `caption_prefix`, `caption_suffix` + * Specifies the prefix and suffix strings to be appended to the captions. Shuffling is performed with these strings included. Be cautious when using `keep_tokens`. + +### DreamBooth-specific options + +DreamBooth-specific options only exist as subsets-specific options. + +#### Subset-specific options + +Options related to the configuration of DreamBooth subsets. + +| Option Name | Example Setting | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` | +| ---- | ---- | ---- | ---- | ---- | +| `image_dir` | `'C:\hoge'` | - | - | o (required) | +| `caption_extension` | `".txt"` | o | o | o | +| `class_tokens` | `"sks girl"` | - | - | o | +| `is_reg` | `false` | - | - | o | + +Firstly, note that for `image_dir`, the path to the image files must be specified as being directly in the directory. 
Unlike the previous DreamBooth method, where images had to be placed in subdirectories, this is not compatible with that specification. Also, even if you name the folder something like "5_cat", the number of repeats of the image and the class name will not be reflected. If you want to set these individually, you will need to explicitly specify them using `num_repeats` and `class_tokens`. + +* `image_dir` + * Specifies the path to the image directory. This is a required option. + * Images must be placed directly under the directory. +* `class_tokens` + * Sets the class tokens. + * Only used during training when a corresponding caption file does not exist. The determination of whether or not to use it is made on a per-image basis. If `class_tokens` is not specified and a caption file is not found, an error will occur. +* `is_reg` + * Specifies whether the subset images are for normalization. If not specified, it is set to `false`, meaning that the images are not for normalization. + +### Fine-tuning method specific options + +The options for the fine-tuning method only exist for subset-specific options. + +#### Subset-specific options + +These options are related to the configuration of the fine-tuning method's subsets. + +| Option name | Example setting | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` | +| ---- | ---- | ---- | ---- | ---- | +| `image_dir` | `'C:\hoge'` | - | - | o | +| `metadata_file` | `'C:\piyo\piyo_md.json'` | - | - | o (required) | + +* `image_dir` + * Specify the path to the image directory. Unlike the DreamBooth method, specifying it is not mandatory, but it is recommended to do so. + * The case where it is not necessary to specify is when the `--full_path` is added to the command line when generating the metadata file. + * The images must be placed directly under the directory. +* `metadata_file` + * Specify the path to the metadata file used for the subset. This is a required option. + * It is equivalent to the command-line argument `--in_json`. + * Due to the specification that a metadata file must be specified for each subset, it is recommended to avoid creating a metadata file with images from different directories as a single metadata file. It is strongly recommended to prepare a separate metadata file for each image directory and register them as separate subsets. + +### Options available when caption dropout method can be used + +The options available when the caption dropout method can be used exist only for subsets. Regardless of whether it's the DreamBooth method or fine-tuning method, if it supports caption dropout, it can be specified. + +#### Subset-specific options + +Options related to the setting of subsets that caption dropout can be used for. + +| Option Name | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` | +| ---- | ---- | ---- | ---- | +| `caption_dropout_every_n_epochs` | o | o | o | +| `caption_dropout_rate` | o | o | o | +| `caption_tag_dropout_rate` | o | o | o | + +## Behavior when there are duplicate subsets + +In the case of the DreamBooth dataset, if there are multiple `image_dir` directories with the same content, they are considered to be duplicate subsets. For the fine-tuning dataset, if there are multiple `metadata_file` files with the same content, they are considered to be duplicate subsets. If duplicate subsets exist in the dataset, subsequent subsets will be ignored. + +However, if they belong to different datasets, they are not considered duplicates. 
For example, if you have subsets with the same `image_dir` in different datasets, they will not be considered duplicates. This is useful when you want to train with the same image but with different resolutions. + +```toml +# If data sets exist separately, they are not considered duplicates and are both used for training. + +[[datasets]] +resolution = 512 + + [[datasets.subsets]] + image_dir = 'C:\hoge' + +[[datasets]] +resolution = 768 + + [[datasets.subsets]] + image_dir = 'C:\hoge' +``` + +## Command Line Argument and Configuration File + +There are options in the configuration file that have overlapping roles with command line argument options. + +The following command line argument options are ignored if a configuration file is passed: + +* `--train_data_dir` +* `--reg_data_dir` +* `--in_json` + +The following command line argument options are given priority over the configuration file options if both are specified simultaneously. In most cases, they have the same names as the corresponding options in the configuration file. + +| Command Line Argument Option | Prioritized Configuration File Option | +| ------------------------------- | ------------------------------------- | +| `--bucket_no_upscale` | | +| `--bucket_reso_steps` | | +| `--caption_dropout_every_n_epochs` | | +| `--caption_dropout_rate` | | +| `--caption_extension` | | +| `--caption_tag_dropout_rate` | | +| `--color_aug` | | +| `--dataset_repeats` | `num_repeats` | +| `--enable_bucket` | | +| `--face_crop_aug_range` | | +| `--flip_aug` | | +| `--keep_tokens` | | +| `--min_bucket_reso` | | +| `--random_crop` | | +| `--resolution` | | +| `--shuffle_caption` | | +| `--train_batch_size` | `batch_size` | + +## Error Guide + +Currently, we are using an external library to check if the configuration file is written correctly, but the development has not been completed, and there is a problem that the error message is not clear. In the future, we plan to improve this problem. + +As a temporary measure, we will list common errors and their solutions. If you encounter an error even though it should be correct or if the error content is not understandable, please contact us as it may be a bug. + +* `voluptuous.error.MultipleInvalid: required key not provided @ ...`: This error occurs when a required option is not provided. It is highly likely that you forgot to specify the option or misspelled the option name. + * The error location is indicated by `...` in the error message. For example, if you encounter an error like `voluptuous.error.MultipleInvalid: required key not provided @ data['datasets'][0]['subsets'][0]['image_dir']`, it means that the `image_dir` option does not exist in the 0th `subsets` of the 0th `datasets` setting. +* `voluptuous.error.MultipleInvalid: expected int for dictionary value @ ...`: This error occurs when the specified value format is incorrect. It is highly likely that the value format is incorrect. The `int` part changes depending on the target option. The example configurations in this README may be helpful. +* `voluptuous.error.MultipleInvalid: extra keys not allowed @ ...`: This error occurs when there is an option name that is not supported. It is highly likely that you misspelled the option name or mistakenly included it. 
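The precedence rule described in the README above (subset over dataset over `[general]`) is straightforward to mirror when reading the file by hand; in the repository, `config_util.load_user_config` does this job with schema validation via voluptuous. A minimal sketch, assuming a `dataset_config.toml` like the examples above:

```python
import toml

with open("dataset_config.toml", "r", encoding="utf-8") as f:
    config = toml.load(f)

general = config.get("general", {})
for dataset in config.get("datasets", []):
    # dataset-level options fall back to [general]
    resolution = dataset.get("resolution", general.get("resolution"))
    for subset in dataset.get("subsets", []):
        # the subset value wins, then the dataset value, then [general]
        keep_tokens = subset.get(
            "keep_tokens", dataset.get("keep_tokens", general.get("keep_tokens", 0))
        )
        print(subset.get("image_dir"), resolution, keep_tokens)
```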
+ + From 5f6196e4c71763250da316cc0f4ce15db1696017 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 20 Mar 2024 16:35:23 +0900 Subject: [PATCH 037/132] update readme --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d03204037..f8601c1b2 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ Most of the documents are written in Japanese. * [Training guide - common](./docs/train_README-ja.md) : data preparation, options etc... * [Chinese version](./docs/train_README-zh.md) * [Dataset config](./docs/config_README-ja.md) + * [English version](./docs/config_README-en.md) * [DreamBooth training guide](./docs/train_db_README-ja.md) * [Step by Step fine-tuning guide](./docs/fine_tune_README_ja.md): * [training LoRA](./docs/train_network_README-ja.md) @@ -263,6 +264,8 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added.
See [docum - `tag_image_by_wd14_tagger.py` で v3 のリポジトリがサポートされました(`--onnx` 指定時のみ有効)。 PR [#1192](https://github.com/kohya-ss/sd-scripts/pull/1192) sdbds 氏に感謝します。 - Onnx のバージョンアップが必要になるかもしれません。デフォルトでは Onnx はインストールされていませんので、`pip install onnx==1.15.0 onnxruntime-gpu==1.17.1` 等でインストール、アップデートしてください。`requirements.txt` のコメントもあわせてご確認ください。 - `tag_image_by_wd14_tagger.py` で、モデルを`--repo_id` のサブディレクトリに保存するようにしました。これにより複数のモデルファイルがキャッシュされます。`--model_dir` 直下の不要なファイルは削除願います。 - +- 各学習スクリプトに、noise offset、ip noise gammaを、それぞれ 0~指定した値の範囲で変動させるオプション `--noise_offset_random_strength` および `--ip_noise_gamma_random_strength` が追加されました。 PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) KohakuBlueleaf 氏に感謝します。 +- データセット設定の[英語版ドキュメント](./docs/config_README-en.md) が追加されました。PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) darkstorm2150 氏に感謝します。 #### Example of dataset settings / データセット設定の記述例: From 3b0db0f17f46148abe345c5cdce76ff707bdccd3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 20 Mar 2024 17:45:35 +0900 Subject: [PATCH 038/132] update readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f8601c1b2..81c176a79 100644 --- a/README.md +++ b/README.md @@ -266,6 +266,8 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - The model is now saved in the subdirectory as `--repo_id` in `tag_image_by_wd14_tagger.py` . This caches multiple repo_id models. Please delete unnecessary files under `--model_dir`. - The options `--noise_offset_random_strength` and `--ip_noise_gamma_random_strength` are added to each training script. These options can be used to vary the noise offset and ip noise gamma in the range of 0 to the specified value. PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) Thanks to KohakuBlueleaf! - The [English version of the dataset settings documentation](./docs/config_README-en.md) is added. PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) Thanks to darkstorm2150! +- The `.toml` file for the dataset config is now read in UTF-8 encoding. PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Thanks to Horizon1704! + - Colab での動作時、ログ出力で停止してしまうようです。学習スクリプトに `--console_log_simple` オプションを指定し、rich のロギングを無効してお試しください。 - `train_network.py` および `sdxl_train_network.py` で、学習したモデルのメタデータに一部のデータセット設定が記録されるよう修正しました(`caption_prefix`、`caption_suffix`、`keep_tokens_separator`、`secondary_separator`、`enable_wildcard`)。 @@ -279,7 +281,7 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. 
See [docum - `tag_image_by_wd14_tagger.py` で、モデルを`--repo_id` のサブディレクトリに保存するようにしました。これにより複数のモデルファイルがキャッシュされます。`--model_dir` 直下の不要なファイルは削除願います。 - 各学習スクリプトに、noise offset、ip noise gammaを、それぞれ 0~指定した値の範囲で変動させるオプション `--noise_offset_random_strength` および `--ip_noise_gamma_random_strength` が追加されました。 PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) KohakuBlueleaf 氏に感謝します。 - データセット設定の[英語版ドキュメント](./docs/config_README-en.md) が追加されました。PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) darkstorm2150 氏に感謝します。 - +- データセット設定の `.toml` ファイルが UTF-8 encoding で読み込まれるようになりました。PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Horizon1704 氏に感謝します。 #### Example of dataset settings / データセット設定の記述例: From 855add067b06464eaa47ed55840da0f17d675762 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 20 Mar 2024 18:14:05 +0900 Subject: [PATCH 039/132] update option help and readme --- README.md | 7 +++++-- library/train_util.py | 8 ++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 81c176a79..804bad84a 100644 --- a/README.md +++ b/README.md @@ -253,6 +253,7 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum ### Working in progress - Colab seems to stop with log output. Try specifying `--console_log_simple` option in the training script to disable rich logging. +- The `.toml` file for the dataset config is now read in UTF-8 encoding. PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Thanks to Horizon1704! - `train_network.py` and `sdxl_train_network.py` are modified to record some dataset settings in the metadata of the trained model (`caption_prefix`, `caption_suffix`, `keep_tokens_separator`, `secondary_separator`, `enable_wildcard`). - Some features are added to the dataset subset settings. - `secondary_separator` is added to specify the tag separator that is not the target of shuffling or dropping. @@ -266,10 +267,11 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - The model is now saved in the subdirectory as `--repo_id` in `tag_image_by_wd14_tagger.py` . This caches multiple repo_id models. Please delete unnecessary files under `--model_dir`. - The options `--noise_offset_random_strength` and `--ip_noise_gamma_random_strength` are added to each training script. These options can be used to vary the noise offset and ip noise gamma in the range of 0 to the specified value. PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) Thanks to KohakuBlueleaf! - The [English version of the dataset settings documentation](./docs/config_README-en.md) is added. PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) Thanks to darkstorm2150! -- The `.toml` file for the dataset config is now read in UTF-8 encoding. PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Thanks to Horizon1704! +- The options `--save_state_on_train_end` are added to each training script. PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) Thanks to gesen2egee! 
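To make the random-strength options listed above concrete: they scale the configured value by a uniform random factor at each step. The following is a minimal sketch of that behavior, not the training scripts' actual code; the function name and the exact sampling call are illustrative assumptions, and `args.noise_offset` / `args.noise_offset_random_strength` stand in for the CLI options.

```python
import torch

# Hedged sketch: vary the noise offset in the range 0..args.noise_offset
# when --noise_offset_random_strength is set (names assumed from the options above).
def apply_noise_offset(latents: torch.Tensor, noise: torch.Tensor, args) -> torch.Tensor:
    offset = args.noise_offset
    if args.noise_offset_random_strength:
        offset = torch.rand(1).item() * offset  # uniform in [0, offset)
    # add one constant offset per sample and channel, as noise offset does
    return noise + offset * torch.randn(
        (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
    )
```

The same scaling idea applies to `--ip_noise_gamma_random_strength`.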
- Colab での動作時、ログ出力で停止してしまうようです。学習スクリプトに `--console_log_simple` オプションを指定し、rich のロギングを無効してお試しください。 +- データセット設定の `.toml` ファイルが UTF-8 encoding で読み込まれるようになりました。PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Horizon1704 氏に感謝します。 - `train_network.py` および `sdxl_train_network.py` で、学習したモデルのメタデータに一部のデータセット設定が記録されるよう修正しました(`caption_prefix`、`caption_suffix`、`keep_tokens_separator`、`secondary_separator`、`enable_wildcard`)。 - データセットのサブセット設定にいくつかの機能を追加しました。 - シャッフルの対象とならないタグ分割識別子の指定 `secondary_separator` を追加しました。`secondary_separator=";;;"` のように指定します。`secondary_separator` で区切ることで、その部分はシャッフル、drop 時にまとめて扱われます。詳しくは記述例をご覧ください。 @@ -281,7 +283,8 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - `tag_image_by_wd14_tagger.py` で、モデルを`--repo_id` のサブディレクトリに保存するようにしました。これにより複数のモデルファイルがキャッシュされます。`--model_dir` 直下の不要なファイルは削除願います。 - 各学習スクリプトに、noise offset、ip noise gammaを、それぞれ 0~指定した値の範囲で変動させるオプション `--noise_offset_random_strength` および `--ip_noise_gamma_random_strength` が追加されました。 PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) KohakuBlueleaf 氏に感謝します。 - データセット設定の[英語版ドキュメント](./docs/config_README-en.md) が追加されました。PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) darkstorm2150 氏に感謝します。 -- データセット設定の `.toml` ファイルが UTF-8 encoding で読み込まれるようになりました。PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Horizon1704 氏に感謝します。 +- 各学習スクリプトに、学習終了時に state を保存する `--save_state_on_train_end` オプションが追加されました。 PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) gesen2egee 氏に感謝します。 + #### Example of dataset settings / データセット設定の記述例: diff --git a/library/train_util.py b/library/train_util.py index 23961505f..a13985ee2 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2936,13 +2936,13 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: parser.add_argument( "--save_state", action="store_true", - help="save training state additionally (including optimizer states etc.) / optimizerなど学習状態も含めたstateを追加で保存する", + help="save training state additionally (including optimizer states etc.) when saving model / optimizerなど学習状態も含めたstateをモデル保存時に追加で保存する", ) parser.add_argument( "--save_state_on_train_end", action="store_true", - help="save training state additionally (including optimizer states etc.) on train end / optimizerなど学習状態も含めたstateを追加で保存する", - ) + help="save training state (including optimizer states etc.) on train end / optimizerなど学習状態も含めたstateを学習完了時に保存する", + ) parser.add_argument("--resume", type=str, default=None, help="saved state to resume training / 学習再開するモデルのstate") parser.add_argument("--train_batch_size", type=int, default=1, help="batch size for training / 学習時のバッチサイズ") @@ -3550,7 +3550,7 @@ def read_config_from_file(args: argparse.Namespace, parser: argparse.ArgumentPar exit(1) logger.info(f"Loading settings from {config_path}...") - with open(config_path, "r", encoding='utf-8') as f: + with open(config_path, "r", encoding="utf-8") as f: config_dict = toml.load(f) # combine all sections into one From d9456020d7547743c809a7c93f9a487276a66c74 Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Wed, 20 Mar 2024 20:52:59 +0900 Subject: [PATCH 040/132] Fix most of ZeRO stage uses optimizer partitioning - we have to prepare optimizer and ds_model at the same time. 
- pull/1139#issuecomment-1986790007 Signed-off-by: BootsofLagrangian --- sdxl_train.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sdxl_train.py b/sdxl_train.py index 613fe30b3..2cb80b6b3 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -412,7 +412,10 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder1=text_encoder1 if train_text_encoder1 else None, text_encoder2=text_encoder2 if train_text_encoder2 else None, ) - ds_model = accelerator.prepare(ds_model) + # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. # pull/1139#issuecomment-1986790007 + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + ds_model, optimizer, train_dataloader, lr_scheduler + ) training_models = [ds_model] else: @@ -423,8 +426,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder1 = accelerator.prepare(text_encoder1) if train_text_encoder2: text_encoder2 = accelerator.prepare(text_encoder2) - - optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) # TextEncoderの出力をキャッシュするときにはCPUへ移動する if args.cache_text_encoder_outputs: From d17c0f508416d734360393804732bfa420fe1c27 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 21 Mar 2024 08:31:29 +0900 Subject: [PATCH 041/132] update dataset config doc --- README.md | 88 +--------------------------------------- docs/config_README-en.md | 73 +++++++++++++++++++++++++++++++++ docs/config_README-ja.md | 75 +++++++++++++++++++++++++++++++++- 3 files changed, 148 insertions(+), 88 deletions(-) diff --git a/README.md b/README.md index 804bad84a..dae311325 100644 --- a/README.md +++ b/README.md @@ -261,7 +261,7 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - `enable_wildcard` is added. When set to `true`, the wildcard notation `{aaa|bbb|ccc}` can be used. See the example below. - `keep_tokens_separator` is updated to be used twice in the caption. When you specify `keep_tokens_separator="|||"`, the part divided by the second `|||` is not shuffled or dropped and remains at the end. - The existing features `caption_prefix` and `caption_suffix` can be used together. `caption_prefix` and `caption_suffix` are processed first, and then `enable_wildcard`, `keep_tokens_separator`, shuffling and dropping, and `secondary_separator` are processed in order. - - The examples are [shown below](#example-of-dataset-settings--データセット設定の記述例). + - See [Dataset config](./docs/config_README-en.md) for details. - The support for v3 repositories is added to `tag_image_by_wd14_tagger.py` (`--onnx` option only). PR [#1192](https://github.com/kohya-ss/sd-scripts/pull/1192) Thanks to sdbds! - Onnx may need to be updated. Onnx is not installed by default, so please install or update it with `pip install onnx==1.15.0 onnxruntime-gpu==1.17.1` etc. Please also check the comments in `requirements.txt`. - The model is now saved in the subdirectory as `--repo_id` in `tag_image_by_wd14_tagger.py` . This caches multiple repo_id models. Please delete unnecessary files under `--model_dir`. @@ -278,6 +278,7 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. 
See [docum - `enable_wildcard` を追加しました。`true` にするとワイルドカード記法 `{aaa|bbb|ccc}` が使えます。詳しくは記述例をご覧ください。 - `keep_tokens_separator` をキャプション内に 2 つ使えるようにしました。たとえば `keep_tokens_separator="|||"` と指定したとき、`1girl, hatsune miku, vocaloid ||| stage, mic ||| best quality, rating: general` とキャプションを指定すると、二番目の `|||` で分割された部分はシャッフル、drop されず末尾に残ります。 - 既存の機能 `caption_prefix` と `caption_suffix` とあわせて使えます。`caption_prefix` と `caption_suffix` は一番最初に処理され、その後、ワイルドカード、`keep_tokens_separator`、シャッフルおよび drop、`secondary_separator` の順に処理されます。 + - 詳細は [データセット設定](./docs/config_README-ja.md) をご覧ください。 - `tag_image_by_wd14_tagger.py` で v3 のリポジトリがサポートされました(`--onnx` 指定時のみ有効)。 PR [#1192](https://github.com/kohya-ss/sd-scripts/pull/1192) sdbds 氏に感謝します。 - Onnx のバージョンアップが必要になるかもしれません。デフォルトでは Onnx はインストールされていませんので、`pip install onnx==1.15.0 onnxruntime-gpu==1.17.1` 等でインストール、アップデートしてください。`requirements.txt` のコメントもあわせてご確認ください。 - `tag_image_by_wd14_tagger.py` で、モデルを`--repo_id` のサブディレクトリに保存するようにしました。これにより複数のモデルファイルがキャッシュされます。`--model_dir` 直下の不要なファイルは削除願います。 @@ -286,91 +287,6 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - 各学習スクリプトに、学習終了時に state を保存する `--save_state_on_train_end` オプションが追加されました。 PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) gesen2egee 氏に感謝します。 -#### Example of dataset settings / データセット設定の記述例: - -```toml -[general] -flip_aug = true -color_aug = false -resolution = [1024, 1024] - -[[datasets]] -batch_size = 6 -enable_bucket = true -bucket_no_upscale = true -caption_extension = ".txt" -keep_tokens_separator= "|||" -shuffle_caption = true -caption_tag_dropout_rate = 0.1 -secondary_separator = ";;;" # subset 側に書くこともできます / can be written in the subset side -enable_wildcard = true # 同上 / same as above - - [[datasets.subsets]] - image_dir = "/path/to/image_dir" - num_repeats = 1 - - # ||| の前後はカンマは不要です(自動的に追加されます) / No comma is required before and after ||| (it is added automatically) - caption_prefix = "1girl, hatsune miku, vocaloid |||" - - # ||| の後はシャッフル、drop されず残ります / After |||, it is not shuffled or dropped and remains - # 単純に文字列として連結されるので、カンマなどは自分で入れる必要があります / It is simply concatenated as a string, so you need to put commas yourself - caption_suffix = ", anime screencap ||| masterpiece, rating: general" -``` - -#### Example of caption, secondary_separator notation: `secondary_separator = ";;;"` - -```txt -1girl, hatsune miku, vocaloid, upper body, looking at viewer, sky;;;cloud;;;day, outdoors -``` -The part `sky;;;cloud;;;day` is replaced with `sky,cloud,day` without shuffling or dropping. When shuffling and dropping are enabled, it is processed as a whole (as one tag). For example, it becomes `vocaloid, 1girl, upper body, sky,cloud,day, outdoors, hatsune miku` (shuffled) or `vocaloid, 1girl, outdoors, looking at viewer, upper body, hatsune miku` (dropped). - -#### Example of caption, enable_wildcard notation: `enable_wildcard = true` - -```txt -1girl, hatsune miku, vocaloid, upper body, looking at viewer, {simple|white} background -``` -`simple` or `white` is randomly selected, and it becomes `simple background` or `white background`. - -```txt -1girl, hatsune miku, vocaloid, {{retro style}} -``` -If you want to include `{` or `}` in the tag string, double them like `{{` or `}}` (in this example, the actual caption used for training is `{retro style}`). 
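To make the `secondary_separator` example shown earlier concrete, here is a hypothetical standalone sketch, not the trainers' actual implementation: parts joined by the secondary separator survive shuffling and dropping as a single tag, and only afterwards is the separator rewritten as a plain comma.

```python
import random

def process_tags(caption: str, secondary_separator: str = ";;;", dropout_rate: float = 0.0) -> str:
    # split on the normal separator; ";;;" groups stay inside one tag
    tags = [t.strip() for t in caption.split(",")]
    tags = [t for t in tags if random.random() >= dropout_rate]  # drop
    random.shuffle(tags)                                          # shuffle
    # only after shuffle/drop is the secondary separator replaced
    return ", ".join(t.replace(secondary_separator, ",") for t in tags)

# e.g. process_tags("1girl, sky;;;cloud;;;day, outdoors")
# -> "sky,cloud,day, outdoors, 1girl" (one possible result)
```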
- -#### Example of caption, `keep_tokens_separator` notation: `keep_tokens_separator = "|||"` - -```txt -1girl, hatsune miku, vocaloid ||| stage, microphone, white shirt, smile ||| best quality, rating: general -``` -It becomes `1girl, hatsune miku, vocaloid, microphone, stage, white shirt, best quality, rating: general` or `1girl, hatsune miku, vocaloid, white shirt, smile, stage, microphone, best quality, rating: general` etc. - - -#### キャプション記述例、secondary_separator 記法:`secondary_separator = ";;;"` の場合 - -```txt -1girl, hatsune miku, vocaloid, upper body, looking at viewer, sky;;;cloud;;;day, outdoors -``` -`sky;;;cloud;;;day` の部分はシャッフル、drop されず `sky,cloud,day` に置換されます。シャッフル、drop が有効な場合、まとめて(一つのタグとして)処理されます。つまり `vocaloid, 1girl, upper body, sky,cloud,day, outdoors, hatsune miku` (シャッフル)や `vocaloid, 1girl, outdoors, looking at viewer, upper body, hatsune miku` (drop されたケース)などになります。 - -#### キャプション記述例、ワイルドカード記法: `enable_wildcard = true` の場合 - -```txt -1girl, hatsune miku, vocaloid, upper body, looking at viewer, {simple|white} background -``` -ランダムに `simple` または `white` が選ばれ、`simple background` または `white background` になります。 - -```txt -1girl, hatsune miku, vocaloid, {{retro style}} -``` -タグ文字列に `{` や `}` そのものを含めたい場合は `{{` や `}}` のように二つ重ねてください(この例では実際に学習に用いられるキャプションは `{retro style}` になります)。 - -#### キャプション記述例、`keep_tokens_separator` 記法: `keep_tokens_separator = "|||"` の場合 - -```txt -1girl, hatsune miku, vocaloid ||| stage, microphone, white shirt, smile ||| best quality, rating: general -``` -`1girl, hatsune miku, vocaloid, microphone, stage, white shirt, best quality, rating: general` や `1girl, hatsune miku, vocaloid, white shirt, smile, stage, microphone, best quality, rating: general` などになります。 - - ### Mar 15, 2024 / 2024/3/15: v0.8.5 - Fixed a bug that the value of timestep embedding during SDXL training was incorrect. diff --git a/docs/config_README-en.md b/docs/config_README-en.md index a0727934d..bdcaabfc7 100644 --- a/docs/config_README-en.md +++ b/docs/config_README-en.md @@ -1,7 +1,10 @@ Original Source by kohya-ss +First version: A.I Translation by Model: NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO, editing by Darkstorm2150 +Some parts are manually added. + # Config Readme This README is about the configuration files that can be passed with the `--dataset_config` option. @@ -143,11 +146,23 @@ These options are related to subset configuration. | `shuffle_caption` | `true` | o | o | o | | `caption_prefix` | `"masterpiece, best quality, "` | o | o | o | | `caption_suffix` | `", from side"` | o | o | o | +| `caption_separator` | (not specified) | o | o | o | +| `keep_tokens_separator` | `“|||”` | o | o | o | +| `secondary_separator` | `“;;;”` | o | o | o | +| `enable_wildcard` | `true` | o | o | o | * `num_repeats` * Specifies the number of repeats for images in a subset. This is equivalent to `--dataset_repeats` in fine-tuning but can be specified for any training method. * `caption_prefix`, `caption_suffix` * Specifies the prefix and suffix strings to be appended to the captions. Shuffling is performed with these strings included. Be cautious when using `keep_tokens`. +* `caption_separator` + * Specifies the string to separate the tags. The default is `,`. This option is usually not necessary to set. +* `keep_tokens_separator` + * Specifies the string to separate the parts to be fixed in the caption. For example, if you specify `aaa, bbb ||| ccc, ddd, eee, fff ||| ggg, hhh`, the parts `aaa, bbb` and `ggg, hhh` will remain, and the rest will be shuffled and dropped. 
The comma in between is not necessary. As a result, the prompt will be `aaa, bbb, eee, ccc, fff, ggg, hhh` or `aaa, bbb, fff, ccc, eee, ggg, hhh`, etc. +* `secondary_separator` + * Specifies an additional separator. The part separated by this separator is treated as one tag and is shuffled and dropped. It is then replaced by `caption_separator`. For example, if you specify `aaa;;;bbb;;;ccc`, it will be replaced by `aaa,bbb,ccc` or dropped together. +* `enable_wildcard` + * Enables wildcard notation. This will be explained later. ### DreamBooth-specific options @@ -276,4 +291,62 @@ As a temporary measure, we will list common errors and their solutions. If you e * `voluptuous.error.MultipleInvalid: expected int for dictionary value @ ...`: This error occurs when the specified value format is incorrect. It is highly likely that the value format is incorrect. The `int` part changes depending on the target option. The example configurations in this README may be helpful. * `voluptuous.error.MultipleInvalid: extra keys not allowed @ ...`: This error occurs when there is an option name that is not supported. It is highly likely that you misspelled the option name or mistakenly included it. +## Miscellaneous + +### Example of configuration file, 設定ファイルの記述例 + +```toml +[general] +flip_aug = true +color_aug = false +resolution = [1024, 1024] + +[[datasets]] +batch_size = 6 +enable_bucket = true +bucket_no_upscale = true +caption_extension = ".txt" +keep_tokens_separator= "|||" +shuffle_caption = true +caption_tag_dropout_rate = 0.1 +secondary_separator = ";;;" # subset 側に書くこともできます / can be written in the subset side +enable_wildcard = true # 同上 / same as above + + [[datasets.subsets]] + image_dir = "/path/to/image_dir" + num_repeats = 1 + + # ||| の前後はカンマは不要です(自動的に追加されます) / No comma is required before and after ||| (it is added automatically) + caption_prefix = "1girl, hatsune miku, vocaloid |||" + + # ||| の後はシャッフル、drop されず残ります / After |||, it is not shuffled or dropped and remains + # 単純に文字列として連結されるので、カンマなどは自分で入れる必要があります / It is simply concatenated as a string, so you need to put commas yourself + caption_suffix = ", anime screencap ||| masterpiece, rating: general" +``` + +### Example of caption, secondary_separator notation: `secondary_separator = ";;;"` + +```txt +1girl, hatsune miku, vocaloid, upper body, looking at viewer, sky;;;cloud;;;day, outdoors +``` +The part `sky;;;cloud;;;day` is replaced with `sky,cloud,day` without shuffling or dropping. When shuffling and dropping are enabled, it is processed as a whole (as one tag). For example, it becomes `vocaloid, 1girl, upper body, sky,cloud,day, outdoors, hatsune miku` (shuffled) or `vocaloid, 1girl, outdoors, looking at viewer, upper body, hatsune miku` (dropped). + +### Example of caption, enable_wildcard notation: `enable_wildcard = true` + +```txt +1girl, hatsune miku, vocaloid, upper body, looking at viewer, {simple|white} background +``` +`simple` or `white` is randomly selected, and it becomes `simple background` or `white background`. + +```txt +1girl, hatsune miku, vocaloid, {{retro style}} +``` +If you want to include `{` or `}` in the tag string, double them like `{{` or `}}` (in this example, the actual caption used for training is `{retro style}`). 
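The wildcard handling just described can be sketched as a small standalone function. The escape-then-substitute approach below mirrors what `library/train_util.py` does later in this patch series (it uses `⦅`/`⦆` as temporary placeholders), but this is an illustrative reconstruction rather than the exact code.

```python
import random
import re

def expand_wildcards(caption: str) -> str:
    # escape literal braces written as {{ and }} with unlikely placeholders
    esc_open, esc_close = "\u2985", "\u2986"  # ⦅ and ⦆
    caption = caption.replace("{{", esc_open).replace("}}", esc_close)

    # replace each {aaa|bbb|ccc} with one randomly chosen alternative
    caption = re.sub(r"\{([^}]+)\}", lambda m: random.choice(m.group(1).split("|")), caption)

    # restore escaped braces as literal { and }
    return caption.replace(esc_open, "{").replace(esc_close, "}")

# expand_wildcards("1girl, {simple|white} background")
#   -> "1girl, simple background" or "1girl, white background"
# expand_wildcards("1girl, {{retro style}}") -> "1girl, {retro style}"
```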
+ +### Example of caption, `keep_tokens_separator` notation: `keep_tokens_separator = "|||"` + +```txt +1girl, hatsune miku, vocaloid ||| stage, microphone, white shirt, smile ||| best quality, rating: general +``` +It becomes `1girl, hatsune miku, vocaloid, microphone, stage, white shirt, best quality, rating: general` or `1girl, hatsune miku, vocaloid, white shirt, smile, stage, microphone, best quality, rating: general` etc. diff --git a/docs/config_README-ja.md b/docs/config_README-ja.md index 69a03f6cf..47bb5c57d 100644 --- a/docs/config_README-ja.md +++ b/docs/config_README-ja.md @@ -1,5 +1,3 @@ -For non-Japanese speakers: this README is provided only in Japanese in the current state. Sorry for inconvenience. We will provide English version in the near future. - `--dataset_config` で渡すことができる設定ファイルに関する説明です。 ## 概要 @@ -140,12 +138,28 @@ DreamBooth の手法と fine tuning の手法の両方とも利用可能な学 | `shuffle_caption` | `true` | o | o | o | | `caption_prefix` | `“masterpiece, best quality, ”` | o | o | o | | `caption_suffix` | `“, from side”` | o | o | o | +| `caption_separator` | (通常は設定しません) | o | o | o | +| `keep_tokens_separator` | `“|||”` | o | o | o | +| `secondary_separator` | `“;;;”` | o | o | o | +| `enable_wildcard` | `true` | o | o | o | * `num_repeats` * サブセットの画像の繰り返し回数を指定します。fine tuning における `--dataset_repeats` に相当しますが、`num_repeats` はどの学習方法でも指定可能です。 * `caption_prefix`, `caption_suffix` * キャプションの前、後に付与する文字列を指定します。シャッフルはこれらの文字列を含めた状態で行われます。`keep_tokens` を指定する場合には注意してください。 +* `caption_separator` + * タグを区切る文字列を指定します。デフォルトは `,` です。このオプションは通常は設定する必要はありません。 + +* `keep_tokens_separator` + * キャプションで固定したい部分を区切る文字列を指定します。たとえば `aaa, bbb ||| ccc, ddd, eee, fff ||| ggg, hhh` のように指定すると、`aaa, bbb` と `ggg, hhh` の部分はシャッフル、drop されず残ります。間のカンマは不要です。結果としてプロンプトは `aaa, bbb, eee, ccc, fff, ggg, hhh` や `aaa, bbb, fff, ccc, eee, ggg, hhh` などになります。 + +* `secondary_separator` + * 追加の区切り文字を指定します。この区切り文字で区切られた部分は一つのタグとして扱われ、シャッフル、drop されます。その後、`caption_separator` に置き換えられます。たとえば `aaa;;;bbb;;;ccc` のように指定すると、`aaa,bbb,ccc` に置き換えられるか、まとめて drop されます。 + +* `enable_wildcard` + * ワイルドカード記法を有効にします。ワイルドカード記法については後述します。 + ### DreamBooth 方式専用のオプション DreamBooth 方式のオプションは、サブセット向けオプションのみ存在します。 @@ -280,4 +294,61 @@ resolution = 768 * `voluptuous.error.MultipleInvalid: expected int for dictionary value @ ...`: 指定する値の形式が不正というエラーです。値の形式が間違っている可能性が高いです。`int` の部分は対象となるオプションによって変わります。この README に載っているオプションの「設定例」が役立つかもしれません。 * `voluptuous.error.MultipleInvalid: extra keys not allowed @ ...`: 対応していないオプション名が存在している場合に発生するエラーです。オプション名を間違って記述しているか、誤って紛れ込んでいる可能性が高いです。 +## その他 + +### Example of configuration file, 設定ファイルの記述例 +```toml +[general] +flip_aug = true +color_aug = false +resolution = [1024, 1024] + +[[datasets]] +batch_size = 6 +enable_bucket = true +bucket_no_upscale = true +caption_extension = ".txt" +keep_tokens_separator= "|||" +shuffle_caption = true +caption_tag_dropout_rate = 0.1 +secondary_separator = ";;;" # subset 側に書くこともできます / can be written in the subset side +enable_wildcard = true # 同上 / same as above + + [[datasets.subsets]] + image_dir = "/path/to/image_dir" + num_repeats = 1 + + # ||| の前後はカンマは不要です(自動的に追加されます) / No comma is required before and after ||| (it is added automatically) + caption_prefix = "1girl, hatsune miku, vocaloid |||" + + # ||| の後はシャッフル、drop されず残ります / After |||, it is not shuffled or dropped and remains + # 単純に文字列として連結されるので、カンマなどは自分で入れる必要があります / It is simply concatenated as a string, so you need to put commas yourself + caption_suffix = ", anime screencap ||| masterpiece, rating: general" +``` + +### 
キャプション記述例、secondary_separator 記法:`secondary_separator = ";;;"` の場合 + +```txt +1girl, hatsune miku, vocaloid, upper body, looking at viewer, sky;;;cloud;;;day, outdoors +``` +`sky;;;cloud;;;day` の部分はシャッフル、drop されず `sky,cloud,day` に置換されます。シャッフル、drop が有効な場合、まとめて(一つのタグとして)処理されます。つまり `vocaloid, 1girl, upper body, sky,cloud,day, outdoors, hatsune miku` (シャッフル)や `vocaloid, 1girl, outdoors, looking at viewer, upper body, hatsune miku` (drop されたケース)などになります。 + +### キャプション記述例、ワイルドカード記法: `enable_wildcard = true` の場合 + +```txt +1girl, hatsune miku, vocaloid, upper body, looking at viewer, {simple|white} background +``` +ランダムに `simple` または `white` が選ばれ、`simple background` または `white background` になります。 + +```txt +1girl, hatsune miku, vocaloid, {{retro style}} +``` +タグ文字列に `{` や `}` そのものを含めたい場合は `{{` や `}}` のように二つ重ねてください(この例では実際に学習に用いられるキャプションは `{retro style}` になります)。 + +### キャプション記述例、`keep_tokens_separator` 記法: `keep_tokens_separator = "|||"` の場合 + +```txt +1girl, hatsune miku, vocaloid ||| stage, microphone, white shirt, smile ||| best quality, rating: general +``` +`1girl, hatsune miku, vocaloid, microphone, stage, white shirt, best quality, rating: general` や `1girl, hatsune miku, vocaloid, white shirt, smile, stage, microphone, best quality, rating: general` などになります。 From 594c7f70500e402586654e73501e7d8fc74592b8 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 23 Mar 2024 16:11:31 +0900 Subject: [PATCH 042/132] format by black --- finetune/merge_captions_to_metadata.py | 132 ++++++++++++++----------- finetune/merge_dd_tags_to_metadata.py | 118 ++++++++++++---------- 2 files changed, 144 insertions(+), 106 deletions(-) diff --git a/finetune/merge_captions_to_metadata.py b/finetune/merge_captions_to_metadata.py index 60765b863..89f717473 100644 --- a/finetune/merge_captions_to_metadata.py +++ b/finetune/merge_captions_to_metadata.py @@ -6,75 +6,95 @@ import library.train_util as train_util import os from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) + def main(args): - assert not args.recursive or (args.recursive and args.full_path), "recursive requires full_path / recursiveはfull_pathと同時に指定してください" + assert not args.recursive or ( + args.recursive and args.full_path + ), "recursive requires full_path / recursiveはfull_pathと同時に指定してください" - train_data_dir_path = Path(args.train_data_dir) - image_paths: List[Path] = train_util.glob_images_pathlib(train_data_dir_path, args.recursive) - logger.info(f"found {len(image_paths)} images.") + train_data_dir_path = Path(args.train_data_dir) + image_paths: List[Path] = train_util.glob_images_pathlib(train_data_dir_path, args.recursive) + logger.info(f"found {len(image_paths)} images.") - if args.in_json is None and Path(args.out_json).is_file(): - args.in_json = args.out_json + if args.in_json is None and Path(args.out_json).is_file(): + args.in_json = args.out_json - if args.in_json is not None: - logger.info(f"loading existing metadata: {args.in_json}") - metadata = json.loads(Path(args.in_json).read_text(encoding='utf-8')) - logger.warning("captions for existing images will be overwritten / 既存の画像のキャプションは上書きされます") - else: - logger.info("new metadata will be created / 新しいメタデータファイルが作成されます") - metadata = {} + if args.in_json is not None: + logger.info(f"loading existing metadata: {args.in_json}") + metadata = json.loads(Path(args.in_json).read_text(encoding="utf-8")) + logger.warning("captions for existing images will be overwritten / 既存の画像のキャプションは上書きされます") + else: + logger.info("new metadata will be created 
/ 新しいメタデータファイルが作成されます") + metadata = {} - logger.info("merge caption texts to metadata json.") - for image_path in tqdm(image_paths): - caption_path = image_path.with_suffix(args.caption_extension) - caption = caption_path.read_text(encoding='utf-8').strip() + logger.info("merge caption texts to metadata json.") + for image_path in tqdm(image_paths): + caption_path = image_path.with_suffix(args.caption_extension) + caption = caption_path.read_text(encoding="utf-8").strip() - if not os.path.exists(caption_path): - caption_path = os.path.join(image_path, args.caption_extension) + if not os.path.exists(caption_path): + caption_path = os.path.join(image_path, args.caption_extension) - image_key = str(image_path) if args.full_path else image_path.stem - if image_key not in metadata: - metadata[image_key] = {} + image_key = str(image_path) if args.full_path else image_path.stem + if image_key not in metadata: + metadata[image_key] = {} - metadata[image_key]['caption'] = caption - if args.debug: - logger.info(f"{image_key} {caption}") + metadata[image_key]["caption"] = caption + if args.debug: + logger.info(f"{image_key} {caption}") - # metadataを書き出して終わり - logger.info(f"writing metadata: {args.out_json}") - Path(args.out_json).write_text(json.dumps(metadata, indent=2), encoding='utf-8') - logger.info("done!") + # metadataを書き出して終わり + logger.info(f"writing metadata: {args.out_json}") + Path(args.out_json).write_text(json.dumps(metadata, indent=2), encoding="utf-8") + logger.info("done!") def setup_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser() - parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ") - parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先") - parser.add_argument("--in_json", type=str, - help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル(省略時、out_jsonが存在すればそれを読み込む)") - parser.add_argument("--caption_extention", type=str, default=None, - help="extension of caption file (for backward compatibility) / 読み込むキャプションファイルの拡張子(スペルミスしていたのを残してあります)") - parser.add_argument("--caption_extension", type=str, default=".caption", help="extension of caption file / 読み込むキャプションファイルの拡張子") - parser.add_argument("--full_path", action="store_true", - help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)") - parser.add_argument("--recursive", action="store_true", - help="recursively look for training tags in all child folders of train_data_dir / train_data_dirのすべての子フォルダにある学習タグを再帰的に探す") - parser.add_argument("--debug", action="store_true", help="debug mode") - - return parser - - -if __name__ == '__main__': - parser = setup_parser() - - args = parser.parse_args() - - # スペルミスしていたオプションを復元する - if args.caption_extention is not None: - args.caption_extension = args.caption_extention - - main(args) + parser = argparse.ArgumentParser() + parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ") + parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先") + parser.add_argument( + "--in_json", + type=str, + help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル(省略時、out_jsonが存在すればそれを読み込む)", + ) + parser.add_argument( + "--caption_extention", + type=str, + default=None, + help="extension of caption file (for backward compatibility) / 読み込むキャプションファイルの拡張子(スペルミスしていたのを残してあります)", + ) + 
parser.add_argument( + "--caption_extension", type=str, default=".caption", help="extension of caption file / 読み込むキャプションファイルの拡張子" + ) + parser.add_argument( + "--full_path", + action="store_true", + help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)", + ) + parser.add_argument( + "--recursive", + action="store_true", + help="recursively look for training tags in all child folders of train_data_dir / train_data_dirのすべての子フォルダにある学習タグを再帰的に探す", + ) + parser.add_argument("--debug", action="store_true", help="debug mode") + + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args = parser.parse_args() + + # スペルミスしていたオプションを復元する + if args.caption_extention is not None: + args.caption_extension = args.caption_extention + + main(args) diff --git a/finetune/merge_dd_tags_to_metadata.py b/finetune/merge_dd_tags_to_metadata.py index 9ef8f14b0..ce22d990e 100644 --- a/finetune/merge_dd_tags_to_metadata.py +++ b/finetune/merge_dd_tags_to_metadata.py @@ -6,70 +6,88 @@ import library.train_util as train_util import os from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) + def main(args): - assert not args.recursive or (args.recursive and args.full_path), "recursive requires full_path / recursiveはfull_pathと同時に指定してください" + assert not args.recursive or ( + args.recursive and args.full_path + ), "recursive requires full_path / recursiveはfull_pathと同時に指定してください" - train_data_dir_path = Path(args.train_data_dir) - image_paths: List[Path] = train_util.glob_images_pathlib(train_data_dir_path, args.recursive) - logger.info(f"found {len(image_paths)} images.") + train_data_dir_path = Path(args.train_data_dir) + image_paths: List[Path] = train_util.glob_images_pathlib(train_data_dir_path, args.recursive) + logger.info(f"found {len(image_paths)} images.") - if args.in_json is None and Path(args.out_json).is_file(): - args.in_json = args.out_json + if args.in_json is None and Path(args.out_json).is_file(): + args.in_json = args.out_json - if args.in_json is not None: - logger.info(f"loading existing metadata: {args.in_json}") - metadata = json.loads(Path(args.in_json).read_text(encoding='utf-8')) - logger.warning("tags data for existing images will be overwritten / 既存の画像のタグは上書きされます") - else: - logger.info("new metadata will be created / 新しいメタデータファイルが作成されます") - metadata = {} + if args.in_json is not None: + logger.info(f"loading existing metadata: {args.in_json}") + metadata = json.loads(Path(args.in_json).read_text(encoding="utf-8")) + logger.warning("tags data for existing images will be overwritten / 既存の画像のタグは上書きされます") + else: + logger.info("new metadata will be created / 新しいメタデータファイルが作成されます") + metadata = {} - logger.info("merge tags to metadata json.") - for image_path in tqdm(image_paths): - tags_path = image_path.with_suffix(args.caption_extension) - tags = tags_path.read_text(encoding='utf-8').strip() + logger.info("merge tags to metadata json.") + for image_path in tqdm(image_paths): + tags_path = image_path.with_suffix(args.caption_extension) + tags = tags_path.read_text(encoding="utf-8").strip() - if not os.path.exists(tags_path): - tags_path = os.path.join(image_path, args.caption_extension) + if not os.path.exists(tags_path): + tags_path = os.path.join(image_path, args.caption_extension) - image_key = str(image_path) if args.full_path else image_path.stem - if image_key not in metadata: - metadata[image_key] = {} + image_key = str(image_path) if args.full_path else 
image_path.stem + if image_key not in metadata: + metadata[image_key] = {} - metadata[image_key]['tags'] = tags - if args.debug: - logger.info(f"{image_key} {tags}") + metadata[image_key]["tags"] = tags + if args.debug: + logger.info(f"{image_key} {tags}") - # metadataを書き出して終わり - logger.info(f"writing metadata: {args.out_json}") - Path(args.out_json).write_text(json.dumps(metadata, indent=2), encoding='utf-8') + # metadataを書き出して終わり + logger.info(f"writing metadata: {args.out_json}") + Path(args.out_json).write_text(json.dumps(metadata, indent=2), encoding="utf-8") - logger.info("done!") + logger.info("done!") def setup_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser() - parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ") - parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先") - parser.add_argument("--in_json", type=str, - help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル(省略時、out_jsonが存在すればそれを読み込む)") - parser.add_argument("--full_path", action="store_true", - help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)") - parser.add_argument("--recursive", action="store_true", - help="recursively look for training tags in all child folders of train_data_dir / train_data_dirのすべての子フォルダにある学習タグを再帰的に探す") - parser.add_argument("--caption_extension", type=str, default=".txt", - help="extension of caption (tag) file / 読み込むキャプション(タグ)ファイルの拡張子") - parser.add_argument("--debug", action="store_true", help="debug mode, print tags") - - return parser - - -if __name__ == '__main__': - parser = setup_parser() - - args = parser.parse_args() - main(args) + parser = argparse.ArgumentParser() + parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ") + parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先") + parser.add_argument( + "--in_json", + type=str, + help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル(省略時、out_jsonが存在すればそれを読み込む)", + ) + parser.add_argument( + "--full_path", + action="store_true", + help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)", + ) + parser.add_argument( + "--recursive", + action="store_true", + help="recursively look for training tags in all child folders of train_data_dir / train_data_dirのすべての子フォルダにある学習タグを再帰的に探す", + ) + parser.add_argument( + "--caption_extension", + type=str, + default=".txt", + help="extension of caption (tag) file / 読み込むキャプション(タグ)ファイルの拡張子", + ) + parser.add_argument("--debug", action="store_true", help="debug mode, print tags") + + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args = parser.parse_args() + main(args) From f4a4c11cd30a885d1d5ddb86bee609305c5398f3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 23 Mar 2024 18:51:37 +0900 Subject: [PATCH 043/132] support multiline captions ref #1155 --- README.md | 8 ++++---- docs/config_README-en.md | 30 +++++++++++++++++++++++++++++- docs/config_README-ja.md | 32 ++++++++++++++++++++++++++++++-- library/train_util.py | 38 +++++++++++++++++++++++++++++++------- 4 files changed, 94 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index dae311325..dd000d126 100644 --- a/README.md +++ b/README.md @@ -257,8 +257,8 @@ ControlNet-LLLite, a novel method for 
ControlNet with SDXL, is added. See [docum - `train_network.py` and `sdxl_train_network.py` are modified to record some dataset settings in the metadata of the trained model (`caption_prefix`, `caption_suffix`, `keep_tokens_separator`, `secondary_separator`, `enable_wildcard`). - Some features are added to the dataset subset settings. - `secondary_separator` is added to specify the tag separator that is not the target of shuffling or dropping. - - Specify `secondary_separator=";;;"`. When you specify `secondary_separator`, the part is not shuffled or dropped. See the example below. - - `enable_wildcard` is added. When set to `true`, the wildcard notation `{aaa|bbb|ccc}` can be used. See the example below. + - Specify `secondary_separator=";;;"`. When you specify `secondary_separator`, the part is not shuffled or dropped. + - `enable_wildcard` is added. When set to `true`, the wildcard notation `{aaa|bbb|ccc}` can be used. The multi-line caption is also enabled. - `keep_tokens_separator` is updated to be used twice in the caption. When you specify `keep_tokens_separator="|||"`, the part divided by the second `|||` is not shuffled or dropped and remains at the end. - The existing features `caption_prefix` and `caption_suffix` can be used together. `caption_prefix` and `caption_suffix` are processed first, and then `enable_wildcard`, `keep_tokens_separator`, shuffling and dropping, and `secondary_separator` are processed in order. - See [Dataset config](./docs/config_README-en.md) for details. @@ -274,8 +274,8 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - データセット設定の `.toml` ファイルが UTF-8 encoding で読み込まれるようになりました。PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Horizon1704 氏に感謝します。 - `train_network.py` および `sdxl_train_network.py` で、学習したモデルのメタデータに一部のデータセット設定が記録されるよう修正しました(`caption_prefix`、`caption_suffix`、`keep_tokens_separator`、`secondary_separator`、`enable_wildcard`)。 - データセットのサブセット設定にいくつかの機能を追加しました。 - - シャッフルの対象とならないタグ分割識別子の指定 `secondary_separator` を追加しました。`secondary_separator=";;;"` のように指定します。`secondary_separator` で区切ることで、その部分はシャッフル、drop 時にまとめて扱われます。詳しくは記述例をご覧ください。 - - `enable_wildcard` を追加しました。`true` にするとワイルドカード記法 `{aaa|bbb|ccc}` が使えます。詳しくは記述例をご覧ください。 + - シャッフルの対象とならないタグ分割識別子の指定 `secondary_separator` を追加しました。`secondary_separator=";;;"` のように指定します。`secondary_separator` で区切ることで、その部分はシャッフル、drop 時にまとめて扱われます。 + - `enable_wildcard` を追加しました。`true` にするとワイルドカード記法 `{aaa|bbb|ccc}` が使えます。また複数行キャプションも有効になります。 - `keep_tokens_separator` をキャプション内に 2 つ使えるようにしました。たとえば `keep_tokens_separator="|||"` と指定したとき、`1girl, hatsune miku, vocaloid ||| stage, mic ||| best quality, rating: general` とキャプションを指定すると、二番目の `|||` で分割された部分はシャッフル、drop されず末尾に残ります。 - 既存の機能 `caption_prefix` と `caption_suffix` とあわせて使えます。`caption_prefix` と `caption_suffix` は一番最初に処理され、その後、ワイルドカード、`keep_tokens_separator`、シャッフルおよび drop、`secondary_separator` の順に処理されます。 - 詳細は [データセット設定](./docs/config_README-ja.md) をご覧ください。 diff --git a/docs/config_README-en.md b/docs/config_README-en.md index bdcaabfc7..e99fde216 100644 --- a/docs/config_README-en.md +++ b/docs/config_README-en.md @@ -293,7 +293,35 @@ As a temporary measure, we will list common errors and their solutions. If you e ## Miscellaneous -### Example of configuration file, 設定ファイルの記述例 +### Multi-line captions + +By setting `enable_wildcard = true`, multiple-line captions are also enabled. If the caption file consists of multiple lines, one line is randomly selected as the caption. 
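A minimal sketch of that selection logic, assuming only the behavior described here (the `process_caption` change later in this patch series does essentially the same):

```python
import random

def pick_caption_line(caption: str, enable_wildcard: bool) -> str:
    if enable_wildcard and "\n" in caption:
        # one line of a multi-line caption is chosen at random
        return random.choice(caption.split("\n"))
    # otherwise only the first line is used
    return caption.split("\n")[0]
```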
+ +```txt +1girl, hatsune miku, vocaloid, upper body, looking at viewer, microphone, stage +a girl with a microphone standing on a stage +detailed digital art of a girl with a microphone on a stage +``` + +It can be combined with wildcard notation. + +In metadata files, you can also specify multiple-line captions. In the `.json` metadata file, use `\n` to represent a line break. If the caption file consists of multiple lines, `merge_captions_to_metadata.py` will create a metadata file in this format. + +The tags in the metadata (`tags`) are added to each line of the caption. + +```json +{ + "/path/to/image.png": { + "caption": "a cartoon of a frog with the word frog on it\ntest multiline caption1\ntest multiline caption2", + "tags": "open mouth, simple background, standing, no humans, animal, black background, frog, animal costume, animal focus" + }, + ... +} +``` + +In this case, the actual caption will be `a cartoon of a frog with the word frog on it, open mouth, simple background ...`, `test multiline caption1, open mouth, simple background ...`, `test multiline caption2, open mouth, simple background ...`, etc. + +### Example of configuration file : `secondary_separator`, wildcard notation, `keep_tokens_separator`, etc. ```toml [general] diff --git a/docs/config_README-ja.md b/docs/config_README-ja.md index 47bb5c57d..b57ae86a7 100644 --- a/docs/config_README-ja.md +++ b/docs/config_README-ja.md @@ -158,7 +158,7 @@ DreamBooth の手法と fine tuning の手法の両方とも利用可能な学 * 追加の区切り文字を指定します。この区切り文字で区切られた部分は一つのタグとして扱われ、シャッフル、drop されます。その後、`caption_separator` に置き換えられます。たとえば `aaa;;;bbb;;;ccc` のように指定すると、`aaa,bbb,ccc` に置き換えられるか、まとめて drop されます。 * `enable_wildcard` - * ワイルドカード記法を有効にします。ワイルドカード記法については後述します。 + * ワイルドカード記法および複数行キャプションを有効にします。ワイルドカード記法、複数行キャプションについては後述します。 ### DreamBooth 方式専用のオプション @@ -296,7 +296,35 @@ resolution = 768 ## その他 -### Example of configuration file, 設定ファイルの記述例 +### 複数行キャプション + +`enable_wildcard = true` を設定することで、複数行キャプションも同時に有効になります。キャプションファイルが複数の行からなる場合、ランダムに一つの行が選ばれてキャプションとして利用されます。 + +```txt +1girl, hatsune miku, vocaloid, upper body, looking at viewer, microphone, stage +a girl with a microphone standing on a stage +detailed digital art of a girl with a microphone on a stage +``` + +ワイルドカード記法と組み合わせることも可能です。 + +メタデータファイルでも同様に複数行キャプションを指定することができます。メタデータの .json 内には、`\n` を使って改行を表現してください。キャプションファイルが複数行からなる場合、`merge_captions_to_metadata.py` を使うと、この形式でメタデータファイルが作成されます。 + +メタデータのタグ (`tags`) は、キャプションの各行に追加されます。 + +```json +{ + "/path/to/image.png": { + "caption": "a cartoon of a frog with the word frog on it\ntest multiline caption1\ntest multiline caption2", + "tags": "open mouth, simple background, standing, no humans, animal, black background, frog, animal costume, animal focus" + }, + ... 
+} +``` + +この場合、実際のキャプションは `a cartoon of a frog with the word frog on it, open mouth, simple background ...` または `test multiline caption1, open mouth, simple background ...`、 `test multiline caption2, open mouth, simple background ...` 等になります。 + +### 設定ファイルの記述例:追加の区切り文字、ワイルドカード記法、`keep_tokens_separator` 等 ```toml [general] diff --git a/library/train_util.py b/library/train_util.py index a13985ee2..d076cf847 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -693,6 +693,10 @@ def process_caption(self, subset: BaseSubset, caption): else: # process wildcards if subset.enable_wildcard: + # if caption is multiline, random choice one line + if "\n" in caption: + caption = random.choice(caption.split("\n")) + # wildcard is like '{aaa|bbb|ccc...}' # escape the curly braces like {{ or }} replacer1 = "⦅" @@ -711,6 +715,9 @@ def replace_wildcard(match): # unescape the curly braces caption = caption.replace(replacer1, "{").replace(replacer2, "}") + else: + # if caption is multiline, use the first line + caption = caption.split("\n")[0] if subset.shuffle_caption or subset.token_warmup_step > 0 or subset.caption_tag_dropout_rate > 0: fixed_tokens = [] @@ -1446,7 +1453,7 @@ def __init__( self.bucket_reso_steps = None # この情報は使われない self.bucket_no_upscale = False - def read_caption(img_path, caption_extension): + def read_caption(img_path, caption_extension, enable_wildcard): # captionの候補ファイル名を作る base_name = os.path.splitext(img_path)[0] base_name_face_det = base_name @@ -1465,7 +1472,10 @@ def read_caption(img_path, caption_extension): logger.error(f"illegal char in file (not UTF-8) / ファイルにUTF-8以外の文字があります: {cap_path}") raise e assert len(lines) > 0, f"caption file is empty / キャプションファイルが空です: {cap_path}" - caption = lines[0].strip() + if enable_wildcard: + caption = "\n".join([line.strip() for line in lines if line.strip() != ""]) # 空行を除く、改行で連結 + else: + caption = lines[0].strip() break return caption @@ -1481,7 +1491,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): captions = [] missing_captions = [] for img_path in img_paths: - cap_for_img = read_caption(img_path, subset.caption_extension) + cap_for_img = read_caption(img_path, subset.caption_extension, subset.enable_wildcard) if cap_for_img is None and subset.class_tokens is None: logger.warning( f"neither caption file nor class tokens are found. 
use empty caption for {img_path} / キャプションファイルもclass tokenも見つかりませんでした。空のキャプションを使用します: {img_path}" @@ -1657,10 +1667,24 @@ def __init__( caption = img_md.get("caption") tags = img_md.get("tags") if caption is None: - caption = tags - elif tags is not None and len(tags) > 0: - caption = caption + ", " + tags - tags_list.append(tags) + caption = tags # could be multiline + tags = None + + if subset.enable_wildcard: + # tags must be single line + if tags is not None: + tags = tags.replace("\n", subset.caption_separator) + + # add tags to each line of caption + if caption is not None and tags is not None: + caption = "\n".join( + [f"{line}{subset.caption_separator}{tags}" for line in caption.split("\n") if line.strip() != ""] + ) + else: + # use as is + if tags is not None and len(tags) > 0: + caption = caption + subset.caption_separator + tags + tags_list.append(tags) if caption is None: caption = "" From 0c7baea88cfa98c5fab2898551c426f2d4fac4c6 Mon Sep 17 00:00:00 2001 From: feffy380 <114889020+feffy380@users.noreply.github.com> Date: Sat, 23 Mar 2024 17:28:02 +0100 Subject: [PATCH 044/132] register reg images with correct subset --- library/train_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index d076cf847..b69fb0950 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1554,7 +1554,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): for img_path, caption in zip(img_paths, captions): info = ImageInfo(img_path, subset.num_repeats, caption, subset.is_reg, img_path) if subset.is_reg: - reg_infos.append(info) + reg_infos.append((info, subset)) else: self.register_image(info, subset) @@ -1575,7 +1575,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): n = 0 first_loop = True while n < num_train_images: - for info in reg_infos: + for info, subset in reg_infos: if first_loop: self.register_image(info, subset) n += info.num_repeats From 79d1c12ab056e7114257d7079f2f8846e329320e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Mar 2024 11:06:37 +0900 Subject: [PATCH 045/132] disable sample_every_n_xxx if value less than 1 ref #1202 --- library/train_util.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index d076cf847..8fbf3283d 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1473,7 +1473,7 @@ def read_caption(img_path, caption_extension, enable_wildcard): raise e assert len(lines) > 0, f"caption file is empty / キャプションファイルが空です: {cap_path}" if enable_wildcard: - caption = "\n".join([line.strip() for line in lines if line.strip() != ""]) # 空行を除く、改行で連結 + caption = "\n".join([line.strip() for line in lines if line.strip() != ""]) # 空行を除く、改行で連結 else: caption = lines[0].strip() break @@ -3338,6 +3338,18 @@ def verify_training_args(args: argparse.Namespace): + " / zero_terminal_snrが有効ですが、v_parameterizationが有効ではありません。学習結果は想定外になる可能性があります" ) + if args.sample_every_n_epochs is not None and args.sample_every_n_epochs <= 0: + logger.warning( + "sample_every_n_epochs is less than or equal to 0, so it will be disabled / sample_every_n_epochsに0以下の値が指定されたため無効になります" + ) + args.sample_every_n_epochs = None + + if args.sample_every_n_steps is not None and args.sample_every_n_steps <= 0: + logger.warning( + "sample_every_n_steps is less than or equal to 0, so it will be disabled / sample_every_n_stepsに0以下の値が指定されたため無効になります" + ) + args.sample_every_n_steps = None + def add_dataset_arguments( parser: argparse.ArgumentParser, 
support_dreambooth: bool, support_caption: bool, support_caption_dropout: bool From 691f04322a48566caf62dd67c2834ca2748c064f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Mar 2024 11:10:26 +0900 Subject: [PATCH 046/132] update readme --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index dd000d126..25226dff3 100644 --- a/README.md +++ b/README.md @@ -266,8 +266,9 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - Onnx may need to be updated. Onnx is not installed by default, so please install or update it with `pip install onnx==1.15.0 onnxruntime-gpu==1.17.1` etc. Please also check the comments in `requirements.txt`. - The model is now saved in the subdirectory as `--repo_id` in `tag_image_by_wd14_tagger.py` . This caches multiple repo_id models. Please delete unnecessary files under `--model_dir`. - The options `--noise_offset_random_strength` and `--ip_noise_gamma_random_strength` are added to each training script. These options can be used to vary the noise offset and ip noise gamma in the range of 0 to the specified value. PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) Thanks to KohakuBlueleaf! -- The [English version of the dataset settings documentation](./docs/config_README-en.md) is added. PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) Thanks to darkstorm2150! - The options `--save_state_on_train_end` are added to each training script. PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) Thanks to gesen2egee! +- The options `--sample_every_n_epochs` and `--sample_every_n_steps` in each training script now display a warning and ignore them when a number less than or equal to `0` is specified. Thanks to S-Del for raising the issue. +- The [English version of the dataset settings documentation](./docs/config_README-en.md) is added. PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) Thanks to darkstorm2150! - Colab での動作時、ログ出力で停止してしまうようです。学習スクリプトに `--console_log_simple` オプションを指定し、rich のロギングを無効してお試しください。 @@ -283,8 +284,9 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. 
See [docum - Onnx のバージョンアップが必要になるかもしれません。デフォルトでは Onnx はインストールされていませんので、`pip install onnx==1.15.0 onnxruntime-gpu==1.17.1` 等でインストール、アップデートしてください。`requirements.txt` のコメントもあわせてご確認ください。 - `tag_image_by_wd14_tagger.py` で、モデルを`--repo_id` のサブディレクトリに保存するようにしました。これにより複数のモデルファイルがキャッシュされます。`--model_dir` 直下の不要なファイルは削除願います。 - 各学習スクリプトに、noise offset、ip noise gammaを、それぞれ 0~指定した値の範囲で変動させるオプション `--noise_offset_random_strength` および `--ip_noise_gamma_random_strength` が追加されました。 PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) KohakuBlueleaf 氏に感謝します。 -- データセット設定の[英語版ドキュメント](./docs/config_README-en.md) が追加されました。PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) darkstorm2150 氏に感謝します。 - 各学習スクリプトに、学習終了時に state を保存する `--save_state_on_train_end` オプションが追加されました。 PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) gesen2egee 氏に感謝します。 +- 各学習スクリプトで `--sample_every_n_epochs` および `--sample_every_n_steps` オプションに `0` 以下の数値を指定した時、警告を表示するとともにそれらを無視するよう変更しました。問題提起していただいた S-Del 氏に感謝します。 +- データセット設定の[英語版ドキュメント](./docs/config_README-en.md) が追加されました。PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) darkstorm2150 氏に感謝します。 ### Mar 15, 2024 / 2024/3/15: v0.8.5 From 381c44955e72f04b57c74aa9b3d9a43c839c631f Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Mar 2024 11:27:18 +0900 Subject: [PATCH 047/132] update readme and typing hint --- README.md | 2 ++ library/train_util.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 25226dff3..a19f7968a 100644 --- a/README.md +++ b/README.md @@ -254,6 +254,7 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - Colab seems to stop with log output. Try specifying `--console_log_simple` option in the training script to disable rich logging. - The `.toml` file for the dataset config is now read in UTF-8 encoding. PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Thanks to Horizon1704! +- Fixed a bug that the last subset settings are applied to all images when multiple subsets of regularization images are specified in the dataset settings. The settings for each subset are correctly applied to each image. PR [#1205](https://github.com/kohya-ss/sd-scripts/pull/1205) Thanks to feffy380! - `train_network.py` and `sdxl_train_network.py` are modified to record some dataset settings in the metadata of the trained model (`caption_prefix`, `caption_suffix`, `keep_tokens_separator`, `secondary_separator`, `enable_wildcard`). - Some features are added to the dataset subset settings. - `secondary_separator` is added to specify the tag separator that is not the target of shuffling or dropping. @@ -273,6 +274,7 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. 
See [docum - Colab での動作時、ログ出力で停止してしまうようです。学習スクリプトに `--console_log_simple` オプションを指定し、rich のロギングを無効してお試しください。 - データセット設定の `.toml` ファイルが UTF-8 encoding で読み込まれるようになりました。PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Horizon1704 氏に感謝します。 +- データセット設定で、正則化画像のサブセットを複数指定した時、最後のサブセットの各種設定がすべてのサブセットの画像に適用される不具合が修正されました。それぞれのサブセットの設定が、それぞれの画像に正しく適用されます。PR [#1205](https://github.com/kohya-ss/sd-scripts/pull/1205) feffy380 氏に感謝します。 - `train_network.py` および `sdxl_train_network.py` で、学習したモデルのメタデータに一部のデータセット設定が記録されるよう修正しました(`caption_prefix`、`caption_suffix`、`keep_tokens_separator`、`secondary_separator`、`enable_wildcard`)。 - データセットのサブセット設定にいくつかの機能を追加しました。 - シャッフルの対象とならないタグ分割識別子の指定 `secondary_separator` を追加しました。`secondary_separator=";;;"` のように指定します。`secondary_separator` で区切ることで、その部分はシャッフル、drop 時にまとめて扱われます。 diff --git a/library/train_util.py b/library/train_util.py index ce6e09245..99aeea90d 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1525,7 +1525,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): logger.info("prepare images.") num_train_images = 0 num_reg_images = 0 - reg_infos: List[ImageInfo] = [] + reg_infos: List[Tuple[ImageInfo, DreamBoothSubset]] = [] for subset in subsets: if subset.num_repeats < 1: logger.warning( From ae97c8bfd18e4b51bdeae0a72753c8e9ceeff29d Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Sun, 24 Mar 2024 14:40:18 +0800 Subject: [PATCH 048/132] [Experimental] Add cache mechanism for dataset groups to avoid long waiting time for initilization (#1178) * support meta cached dataset * add cache meta scripts * random ip_noise_gamma strength * random noise_offset strength * use correct settings for parser * cache path/caption/size only * revert mess up commit * revert mess up commit * Update requirements.txt * Add arguments for meta cache. 
* remove pickle implementation * Return sizes when enable cache --------- Co-authored-by: Kohya S <52813779+kohya-ss@users.noreply.github.com> --- cache_dataset_meta.py | 103 +++++++++++++++++++++++++++++++++++++++++ library/config_util.py | 4 ++ library/train_util.py | 83 ++++++++++++++++++++++++--------- requirements.txt | 2 + train_network.py | 3 +- 5 files changed, 173 insertions(+), 22 deletions(-) create mode 100644 cache_dataset_meta.py diff --git a/cache_dataset_meta.py b/cache_dataset_meta.py new file mode 100644 index 000000000..7e7d96d12 --- /dev/null +++ b/cache_dataset_meta.py @@ -0,0 +1,103 @@ +import argparse +import random + +from accelerate.utils import set_seed + +import library.train_util as train_util +import library.config_util as config_util +from library.config_util import ( + ConfigSanitizer, + BlueprintGenerator, +) +import library.custom_train_functions as custom_train_functions +from library.utils import setup_logging, add_logging_arguments + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +def make_dataset(args): + train_util.prepare_dataset_args(args, True) + setup_logging(args, reset=True) + + use_dreambooth_method = args.in_json is None + use_user_config = args.dataset_config is not None + + if args.seed is None: + args.seed = random.randint(0, 2**32) + set_seed(args.seed) + + # データセットを準備する + if args.dataset_class is None: + blueprint_generator = BlueprintGenerator( + ConfigSanitizer(True, True, False, True) + ) + if use_user_config: + logger.info(f"Loading dataset config from {args.dataset_config}") + user_config = config_util.load_user_config(args.dataset_config) + ignored = ["train_data_dir", "reg_data_dir", "in_json"] + if any(getattr(args, attr) is not None for attr in ignored): + logger.warning( + "ignoring the following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( + ", ".join(ignored) + ) + ) + else: + if use_dreambooth_method: + logger.info("Using DreamBooth method.") + user_config = { + "datasets": [ + { + "subsets": config_util.generate_dreambooth_subsets_config_by_subdirs( + args.train_data_dir, args.reg_data_dir + ) + } + ] + } + else: + logger.info("Training with captions.") + user_config = { + "datasets": [ + { + "subsets": [ + { + "image_dir": args.train_data_dir, + "metadata_file": args.in_json, + } + ] + } + ] + } + + blueprint = blueprint_generator.generate(user_config, args, tokenizer=None) + train_dataset_group = config_util.generate_dataset_group_by_blueprint( + blueprint.dataset_group + ) + else: + # use arbitrary dataset class + train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizer=None) + return train_dataset_group + + +def setup_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + add_logging_arguments(parser) + train_util.add_dataset_arguments(parser, True, True, True) + train_util.add_training_arguments(parser, True) + config_util.add_config_arguments(parser) + custom_train_functions.add_custom_train_arguments(parser) + return parser + + +if __name__ == "__main__": + parser = setup_parser() + + args, unknown = parser.parse_known_args() + args = train_util.read_config_from_file(args, parser) + if args.max_token_length is None: + args.max_token_length = 75 + args.cache_meta = True + + dataset_group = make_dataset(args) diff --git a/library/config_util.py b/library/config_util.py index eb652ecf3..58ffa5f4d 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -111,6 +111,8 @@ class 
DreamBoothDatasetParams(BaseDatasetParams): bucket_reso_steps: int = 64 bucket_no_upscale: bool = False prior_loss_weight: float = 1.0 + cache_meta: bool = False + use_cached_meta: bool = False @dataclass @@ -228,6 +230,8 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "min_bucket_reso": int, "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int), "network_multiplier": float, + "cache_meta": bool, + "use_cached_meta": bool, } # options handled by argparse but not handled by user config diff --git a/library/train_util.py b/library/train_util.py index 99aeea90d..58c0cc14b 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -63,6 +63,7 @@ from huggingface_hub import hf_hub_download import numpy as np from PIL import Image +import imagesize import cv2 import safetensors.torch from library.lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline @@ -1080,8 +1081,7 @@ def cache_text_encoder_outputs( ) def get_image_size(self, image_path): - image = Image.open(image_path) - return image.size + return imagesize.get(image_path) def load_image_with_face_info(self, subset: BaseSubset, image_path: str): img = load_image(image_path) @@ -1425,6 +1425,8 @@ def __init__( bucket_no_upscale: bool, prior_loss_weight: float, debug_dataset: bool, + cache_meta: bool, + use_cached_meta: bool, ) -> None: super().__init__(tokenizer, max_token_length, resolution, network_multiplier, debug_dataset) @@ -1484,26 +1486,43 @@ def load_dreambooth_dir(subset: DreamBoothSubset): logger.warning(f"not directory: {subset.image_dir}") return [], [] - img_paths = glob_images(subset.image_dir, "*") + sizes = None + if use_cached_meta: + logger.info(f"using cached metadata: {subset.image_dir}/dataset.txt") + # [img_path, caption, resolution] + with open(f"{subset.image_dir}/dataset.txt", "r", encoding="utf-8") as f: + metas = f.readlines() + metas = [x.strip().split("<|##|>") for x in metas] + sizes = [tuple(int(res) for res in x[2].split(" ")) for x in metas] + + if use_cached_meta: + img_paths = [x[0] for x in metas] + else: + img_paths = glob_images(subset.image_dir, "*") + sizes = [None]*len(img_paths) logger.info(f"found directory {subset.image_dir} contains {len(img_paths)} image files") - # 画像ファイルごとにプロンプトを読み込み、もしあればそちらを使う - captions = [] - missing_captions = [] - for img_path in img_paths: - cap_for_img = read_caption(img_path, subset.caption_extension, subset.enable_wildcard) - if cap_for_img is None and subset.class_tokens is None: - logger.warning( - f"neither caption file nor class tokens are found. use empty caption for {img_path} / キャプションファイルもclass tokenも見つかりませんでした。空のキャプションを使用します: {img_path}" - ) - captions.append("") - missing_captions.append(img_path) - else: - if cap_for_img is None: - captions.append(subset.class_tokens) + if use_cached_meta: + captions = [x[1] for x in metas] + missing_captions = [x[0] for x in metas if x[1] == ""] + else: + # 画像ファイルごとにプロンプトを読み込み、もしあればそちらを使う + captions = [] + missing_captions = [] + for img_path in img_paths: + cap_for_img = read_caption(img_path, subset.caption_extension, subset.enable_wildcard) + if cap_for_img is None and subset.class_tokens is None: + logger.warning( + f"neither caption file nor class tokens are found. 
use empty caption for {img_path} / キャプションファイルもclass tokenも見つかりませんでした。空のキャプションを使用します: {img_path}" + ) + captions.append("") missing_captions.append(img_path) else: - captions.append(cap_for_img) + if cap_for_img is None: + captions.append(subset.class_tokens) + missing_captions.append(img_path) + else: + captions.append(cap_for_img) self.set_tag_frequency(os.path.basename(subset.image_dir), captions) # タグ頻度を記録 @@ -1520,7 +1539,21 @@ def load_dreambooth_dir(subset: DreamBoothSubset): logger.warning(missing_caption + f"... and {remaining_missing_captions} more") break logger.warning(missing_caption) - return img_paths, captions + + if cache_meta: + logger.info(f"cache metadata for {subset.image_dir}") + if sizes is None or sizes[0] is None: + sizes = [self.get_image_size(img_path) for img_path in img_paths] + # [img_path, caption, resolution] + data = [ + (img_path, caption, " ".join(str(x) for x in size)) + for img_path, caption, size in zip(img_paths, captions, sizes) + ] + with open(f"{subset.image_dir}/dataset.txt", "w", encoding="utf-8") as f: + f.write("\n".join(["<|##|>".join(x) for x in data])) + logger.info(f"cache metadata done for {subset.image_dir}") + + return img_paths, captions, sizes logger.info("prepare images.") num_train_images = 0 @@ -1539,7 +1572,7 @@ def load_dreambooth_dir(subset: DreamBoothSubset): ) continue - img_paths, captions = load_dreambooth_dir(subset) + img_paths, captions, sizes = load_dreambooth_dir(subset) if len(img_paths) < 1: logger.warning( f"ignore subset with image_dir='{subset.image_dir}': no images found / 画像が見つからないためサブセットを無視します" @@ -1551,8 +1584,10 @@ def load_dreambooth_dir(subset: DreamBoothSubset): else: num_train_images += subset.num_repeats * len(img_paths) - for img_path, caption in zip(img_paths, captions): + for img_path, caption, size in zip(img_paths, captions, sizes): info = ImageInfo(img_path, subset.num_repeats, caption, subset.is_reg, img_path) + if size is not None: + info.image_size = size if subset.is_reg: reg_infos.append((info, subset)) else: @@ -3355,6 +3390,12 @@ def add_dataset_arguments( parser: argparse.ArgumentParser, support_dreambooth: bool, support_caption: bool, support_caption_dropout: bool ): # dataset common + parser.add_argument( + "--cache_meta", action="store_true" + ) + parser.add_argument( + "--use_cached_meta", action="store_true" + ) parser.add_argument( "--train_data_dir", type=str, default=None, help="directory for train images / 学習画像データのディレクトリ" ) diff --git a/requirements.txt b/requirements.txt index 805f0501d..c7aeb6895 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,8 @@ easygui==0.98.3 toml==0.10.2 voluptuous==0.13.1 huggingface-hub==0.20.1 +# for Image utils +imagesize==1.4.1 # for BLIP captioning # requests==2.28.2 # timm==0.6.12 diff --git a/train_network.py b/train_network.py index 9e573d9f6..b42daba71 100644 --- a/train_network.py +++ b/train_network.py @@ -6,6 +6,7 @@ import random import time import json +import pickle from multiprocessing import Value import toml @@ -23,7 +24,7 @@ import library.train_util as train_util from library.train_util import ( - DreamBoothDataset, + DreamBoothDataset, DatasetGroup ) import library.config_util as config_util from library.config_util import ( From 025347214d761d63c5475fec83e11856f3cdbe9d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Mar 2024 18:09:32 +0900 Subject: [PATCH 049/132] refactor metadata caching for DreamBooth dataset --- cache_dataset_meta.py | 103 --------------------------------------- docs/config_README-en.md | 4 ++ 
docs/config_README-ja.md | 4 ++ library/config_util.py | 39 +++++++++------ library/train_util.py | 86 ++++++++++++++++++-------------- train_network.py | 8 +-- 6 files changed, 85 insertions(+), 159 deletions(-) delete mode 100644 cache_dataset_meta.py diff --git a/cache_dataset_meta.py b/cache_dataset_meta.py deleted file mode 100644 index 7e7d96d12..000000000 --- a/cache_dataset_meta.py +++ /dev/null @@ -1,103 +0,0 @@ -import argparse -import random - -from accelerate.utils import set_seed - -import library.train_util as train_util -import library.config_util as config_util -from library.config_util import ( - ConfigSanitizer, - BlueprintGenerator, -) -import library.custom_train_functions as custom_train_functions -from library.utils import setup_logging, add_logging_arguments - -setup_logging() -import logging - -logger = logging.getLogger(__name__) - - -def make_dataset(args): - train_util.prepare_dataset_args(args, True) - setup_logging(args, reset=True) - - use_dreambooth_method = args.in_json is None - use_user_config = args.dataset_config is not None - - if args.seed is None: - args.seed = random.randint(0, 2**32) - set_seed(args.seed) - - # データセットを準備する - if args.dataset_class is None: - blueprint_generator = BlueprintGenerator( - ConfigSanitizer(True, True, False, True) - ) - if use_user_config: - logger.info(f"Loading dataset config from {args.dataset_config}") - user_config = config_util.load_user_config(args.dataset_config) - ignored = ["train_data_dir", "reg_data_dir", "in_json"] - if any(getattr(args, attr) is not None for attr in ignored): - logger.warning( - "ignoring the following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( - ", ".join(ignored) - ) - ) - else: - if use_dreambooth_method: - logger.info("Using DreamBooth method.") - user_config = { - "datasets": [ - { - "subsets": config_util.generate_dreambooth_subsets_config_by_subdirs( - args.train_data_dir, args.reg_data_dir - ) - } - ] - } - else: - logger.info("Training with captions.") - user_config = { - "datasets": [ - { - "subsets": [ - { - "image_dir": args.train_data_dir, - "metadata_file": args.in_json, - } - ] - } - ] - } - - blueprint = blueprint_generator.generate(user_config, args, tokenizer=None) - train_dataset_group = config_util.generate_dataset_group_by_blueprint( - blueprint.dataset_group - ) - else: - # use arbitrary dataset class - train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizer=None) - return train_dataset_group - - -def setup_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser() - add_logging_arguments(parser) - train_util.add_dataset_arguments(parser, True, True, True) - train_util.add_training_arguments(parser, True) - config_util.add_config_arguments(parser) - custom_train_functions.add_custom_train_arguments(parser) - return parser - - -if __name__ == "__main__": - parser = setup_parser() - - args, unknown = parser.parse_known_args() - args = train_util.read_config_from_file(args, parser) - if args.max_token_length is None: - args.max_token_length = 75 - args.cache_meta = True - - dataset_group = make_dataset(args) diff --git a/docs/config_README-en.md b/docs/config_README-en.md index e99fde216..83bea329b 100644 --- a/docs/config_README-en.md +++ b/docs/config_README-en.md @@ -177,6 +177,7 @@ Options related to the configuration of DreamBooth subsets. 
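The refactor in this commit replaces `dataset.txt` with a JSON cache named `metadata_cache.json`, exposed as the `cache_info` option documented below. Per the format comment in the commit's `train_util.py` hunk further down, each image path maps to its caption and resolution. A hedged sketch of reading and writing that layout (the helper names are illustrative, not the repository's API):

```python
import json

CACHE_FILE = "metadata_cache.json"  # file name taken from the diff

def write_metadata_cache(path, img_paths, captions, sizes):
    # {img_path: {"caption": ..., "resolution": [width, height]}, ...}
    metas = {p: {"caption": c, "resolution": list(s)} for p, c, s in zip(img_paths, captions, sizes)}
    with open(path, "w", encoding="utf-8") as f:
        json.dump(metas, f, ensure_ascii=False, indent=2)

def read_metadata_cache(path):
    with open(path, "r", encoding="utf-8") as f:
        metas = json.load(f)
    img_paths = list(metas.keys())
    captions = [m["caption"] for m in metas.values()]
    sizes = [tuple(m["resolution"]) for m in metas.values()]
    return img_paths, captions, sizes
```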
| `image_dir` | `'C:\hoge'` | - | - | o (required) | | `caption_extension` | `".txt"` | o | o | o | | `class_tokens` | `"sks girl"` | - | - | o | +| `cache_info` | `false` | o | o | o | | `is_reg` | `false` | - | - | o | Firstly, note that for `image_dir`, the path to the image files must be specified as being directly in the directory. Unlike the previous DreamBooth method, where images had to be placed in subdirectories, this is not compatible with that specification. Also, even if you name the folder something like "5_cat", the number of repeats of the image and the class name will not be reflected. If you want to set these individually, you will need to explicitly specify them using `num_repeats` and `class_tokens`. @@ -187,6 +188,9 @@ Firstly, note that for `image_dir`, the path to the image files must be specifie * `class_tokens` * Sets the class tokens. * Only used during training when a corresponding caption file does not exist. The determination of whether or not to use it is made on a per-image basis. If `class_tokens` is not specified and a caption file is not found, an error will occur. +* `cache_info` + * Specifies whether to cache the image size and caption. If not specified, it is set to `false`. The cache is saved in `metadata_cache.json` in `image_dir`. + * Caching speeds up the loading of the dataset after the first time. It is effective when dealing with thousands of images or more. * `is_reg` * Specifies whether the subset images are for normalization. If not specified, it is set to `false`, meaning that the images are not for normalization. diff --git a/docs/config_README-ja.md b/docs/config_README-ja.md index b57ae86a7..cc74c341b 100644 --- a/docs/config_README-ja.md +++ b/docs/config_README-ja.md @@ -173,6 +173,7 @@ DreamBooth 方式のサブセットの設定に関わるオプションです。 | `image_dir` | `‘C:\hoge’` | - | - | o(必須) | | `caption_extension` | `".txt"` | o | o | o | | `class_tokens` | `“sks girl”` | - | - | o | +| `cache_info` | `false` | o | o | o | | `is_reg` | `false` | - | - | o | まず注意点として、 `image_dir` には画像ファイルが直下に置かれているパスを指定する必要があります。従来の DreamBooth の手法ではサブディレクトリに画像を置く必要がありましたが、そちらとは仕様に互換性がありません。また、`5_cat` のようなフォルダ名にしても、画像の繰り返し回数とクラス名は反映されません。これらを個別に設定したい場合、`num_repeats` と `class_tokens` で明示的に指定する必要があることに注意してください。 @@ -183,6 +184,9 @@ DreamBooth 方式のサブセットの設定に関わるオプションです。 * `class_tokens` * クラストークンを設定します。 * 画像に対応する caption ファイルが存在しない場合にのみ学習時に利用されます。利用するかどうかの判定は画像ごとに行います。`class_tokens` を指定しなかった場合に caption ファイルも見つからなかった場合にはエラーになります。 +* `cache_info` + * 画像サイズ、キャプションをキャッシュするかどうかを指定します。指定しなかった場合は `false` になります。キャッシュは `image_dir` に `metadata_cache.json` というファイル名で保存されます。 + * キャッシュを行うと、二回目以降のデータセット読み込みが高速化されます。数千枚以上の画像を扱う場合には有効です。 * `is_reg` * サブセットの画像が正規化用かどうかを指定します。指定しなかった場合は `false` として、つまり正規化画像ではないとして扱います。 diff --git a/library/config_util.py b/library/config_util.py index 58ffa5f4d..e52b7fc02 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -41,12 +41,17 @@ DatasetGroup, ) from .utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) + def add_config_arguments(parser: argparse.ArgumentParser): - parser.add_argument("--dataset_config", type=Path, default=None, help="config file for detail settings / 詳細な設定用の設定ファイル") + parser.add_argument( + "--dataset_config", type=Path, default=None, help="config file for detail settings / 詳細な設定用の設定ファイル" + ) # TODO: inherit Params class in Subset, Dataset @@ -80,6 +85,7 @@ class DreamBoothSubsetParams(BaseSubsetParams): is_reg: bool = False class_tokens: Optional[str] = None caption_extension: str = 
".caption" + cache_info: bool = False @dataclass @@ -91,6 +97,7 @@ class FineTuningSubsetParams(BaseSubsetParams): class ControlNetSubsetParams(BaseSubsetParams): conditioning_data_dir: str = None caption_extension: str = ".caption" + cache_info: bool = False @dataclass @@ -111,8 +118,6 @@ class DreamBoothDatasetParams(BaseDatasetParams): bucket_reso_steps: int = 64 bucket_no_upscale: bool = False prior_loss_weight: float = 1.0 - cache_meta: bool = False - use_cached_meta: bool = False @dataclass @@ -202,6 +207,7 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] DB_SUBSET_ASCENDABLE_SCHEMA = { "caption_extension": str, "class_tokens": str, + "cache_info": bool, } DB_SUBSET_DISTINCT_SCHEMA = { Required("image_dir"): str, @@ -214,6 +220,7 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] } CN_SUBSET_ASCENDABLE_SCHEMA = { "caption_extension": str, + "cache_info": bool, } CN_SUBSET_DISTINCT_SCHEMA = { Required("image_dir"): str, @@ -230,8 +237,6 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "min_bucket_reso": int, "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int), "network_multiplier": float, - "cache_meta": bool, - "use_cached_meta": bool, } # options handled by argparse but not handled by user config @@ -366,7 +371,9 @@ def sanitize_argparse_namespace(self, argparse_namespace: argparse.Namespace) -> return self.argparse_config_validator(argparse_namespace) except MultipleInvalid: # XXX: this should be a bug - logger.error("Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。") + logger.error( + "Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。" + ) raise # NOTE: value would be overwritten by latter dict if there is already the same key @@ -551,11 +558,11 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu " ", ) - logger.info(f'{info}') + logger.info(f"{info}") # make buckets first because it determines the length of dataset # and set the same seed for all datasets - seed = random.randint(0, 2**31) # actual seed is seed + epoch_no + seed = random.randint(0, 2**31) # actual seed is seed + epoch_no for i, dataset in enumerate(datasets): logger.info(f"[Dataset {i}]") dataset.make_buckets() @@ -642,13 +649,17 @@ def load_user_config(file: str) -> dict: with open(file, "r") as f: config = json.load(f) except Exception: - logger.error(f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}") + logger.error( + f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}" + ) raise elif file.name.lower().endswith(".toml"): try: config = toml.load(file) except Exception: - logger.error(f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}") + logger.error( + f"Error on parsing TOML config file. Please check the format. 
/ TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}" + ) raise else: raise ValueError(f"not supported config file format / 対応していない設定ファイルの形式です: {file}") @@ -675,13 +686,13 @@ def load_user_config(file: str) -> dict: train_util.prepare_dataset_args(argparse_namespace, config_args.support_finetuning) logger.info("[argparse_namespace]") - logger.info(f'{vars(argparse_namespace)}') + logger.info(f"{vars(argparse_namespace)}") user_config = load_user_config(config_args.dataset_config) logger.info("") logger.info("[user_config]") - logger.info(f'{user_config}') + logger.info(f"{user_config}") sanitizer = ConfigSanitizer( config_args.support_dreambooth, config_args.support_finetuning, config_args.support_controlnet, config_args.support_dropout @@ -690,10 +701,10 @@ def load_user_config(file: str) -> dict: logger.info("") logger.info("[sanitized_user_config]") - logger.info(f'{sanitized_user_config}') + logger.info(f"{sanitized_user_config}") blueprint = BlueprintGenerator(sanitizer).generate(user_config, argparse_namespace) logger.info("") logger.info("[blueprint]") - logger.info(f'{blueprint}') + logger.info(f"{blueprint}") diff --git a/library/train_util.py b/library/train_util.py index 58c0cc14b..743a1147b 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -410,6 +410,7 @@ def __init__( is_reg: bool, class_tokens: Optional[str], caption_extension: str, + cache_info: bool, num_repeats, shuffle_caption, caption_separator: str, @@ -458,6 +459,7 @@ def __init__( self.caption_extension = caption_extension if self.caption_extension and not self.caption_extension.startswith("."): self.caption_extension = "." + self.caption_extension + self.cache_info = cache_info def __eq__(self, other) -> bool: if not isinstance(other, DreamBoothSubset): @@ -527,6 +529,7 @@ def __init__( image_dir: str, conditioning_data_dir: str, caption_extension: str, + cache_info: bool, num_repeats, shuffle_caption, caption_separator, @@ -574,6 +577,7 @@ def __init__( self.caption_extension = caption_extension if self.caption_extension and not self.caption_extension.startswith("."): self.caption_extension = "." 
+ self.caption_extension
+        self.cache_info = cache_info

     def __eq__(self, other) -> bool:
         if not isinstance(other, ControlNetSubset):
@@ -1410,6 +1414,8 @@ def get_item_for_caching(self, bucket, bucket_batch_size, image_index):

 class DreamBoothDataset(BaseDataset):
+    IMAGE_INFO_CACHE_FILE = "metadata_cache.json"
+
     def __init__(
         self,
         subsets: Sequence[DreamBoothSubset],
@@ -1425,8 +1431,6 @@ def __init__(
         bucket_no_upscale: bool,
         prior_loss_weight: float,
         debug_dataset: bool,
-        cache_meta: bool,
-        use_cached_meta: bool,
     ) -> None:
         super().__init__(tokenizer, max_token_length, resolution, network_multiplier, debug_dataset)
@@ -1486,25 +1490,36 @@ def load_dreambooth_dir(subset: DreamBoothSubset):
                 logger.warning(f"not directory: {subset.image_dir}")
                 return [], []

-            sizes = None
-            if use_cached_meta:
-                logger.info(f"using cached metadata: {subset.image_dir}/dataset.txt")
-                # [img_path, caption, resolution]
-                with open(f"{subset.image_dir}/dataset.txt", "r", encoding="utf-8") as f:
-                    metas = f.readlines()
-                metas = [x.strip().split("<|##|>") for x in metas]
-                sizes = [tuple(int(res) for res in x[2].split(" ")) for x in metas]
-
-            if use_cached_meta:
-                img_paths = [x[0] for x in metas]
+            info_cache_file = os.path.join(subset.image_dir, self.IMAGE_INFO_CACHE_FILE)
+            use_cached_info_for_subset = subset.cache_info
+            if use_cached_info_for_subset:
+                logger.info(
+                    f"using cached image info for this subset / このサブセットで、キャッシュされた画像情報を使います: {info_cache_file}"
+                )
+                if not os.path.isfile(info_cache_file):
+                    logger.warning(
+                        f"image info file not found. You can ignore this warning if this is the first time to use this subset"
+                        + f" / キャッシュファイルが見つかりませんでした。初回実行時はこの警告を無視してください: {info_cache_file}"
+                    )
+                    use_cached_info_for_subset = False
+
+            if use_cached_info_for_subset:
+                # json: {`img_path`:{"caption": "caption...", "resolution": [width, height]}, ...}
+                with open(info_cache_file, "r", encoding="utf-8") as f:
+                    metas = json.load(f)
+                img_paths = list(metas.keys())
+                sizes = [meta["resolution"] for meta in metas.values()]
+
+                # we may need to check image size and existence of image files, but it takes time, so user should check it before training
             else:
                 img_paths = glob_images(subset.image_dir, "*")
-                sizes = [None]*len(img_paths)
+                sizes = [None] * len(img_paths)
+
             logger.info(f"found directory {subset.image_dir} contains {len(img_paths)} image files")

-            if use_cached_meta:
-                captions = [x[1] for x in metas]
-                missing_captions = [x[0] for x in metas if x[1] == ""]
+            if use_cached_info_for_subset:
+                captions = [meta["caption"] for meta in metas.values()]
+                missing_captions = [img_path for img_path, caption in zip(img_paths, captions) if caption is None or caption == ""]
             else:
                 # 画像ファイルごとにプロンプトを読み込み、もしあればそちらを使う
                 captions = []
                 missing_captions = []
@@ -1540,19 +1555,17 @@ def load_dreambooth_dir(subset: DreamBoothSubset):
                         break
                     logger.warning(missing_caption)

-            if cache_meta:
-                logger.info(f"cache metadata for {subset.image_dir}")
-                if sizes is None or sizes[0] is None:
-                    sizes = [self.get_image_size(img_path) for img_path in img_paths]
-                # [img_path, caption, resolution]
-                data = [
-                    (img_path, caption, " ".join(str(x) for x in size))
-                    for img_path, caption, size in zip(img_paths, captions, sizes)
-                ]
-                with open(f"{subset.image_dir}/dataset.txt", "w", encoding="utf-8") as f:
-                    f.write("\n".join(["<|##|>".join(x) for x in data]))
-                logger.info(f"cache metadata done for {subset.image_dir}")
-
+            if not use_cached_info_for_subset and subset.cache_info:
+                logger.info(f"cache image info for / 画像情報をキャッシュします : {info_cache_file}")
+                sizes = [self.get_image_size(img_path) for img_path in tqdm(img_paths, desc="get image size")]
+                metas = {}
+                for img_path, caption, size in zip(img_paths, captions, sizes):
+                    metas[img_path] = {"caption": caption, "resolution": list(size)}
+                with open(info_cache_file, "w", encoding="utf-8") as f:
+                    json.dump(metas, f, ensure_ascii=False, indent=2)
+                logger.info(f"cache image info done for / 画像情報を出力しました : {info_cache_file}")
+
+            # if sizes are not set, image size will be read in make_buckets
             return img_paths, captions, sizes

         logger.info("prepare images.")
@@ -1873,7 +1886,8 @@ def __init__(
                 subset.image_dir,
                 False,
                 None,
-                subset.caption_extension,
+                subset.caption_extension,
+                subset.cache_info,
                 subset.num_repeats,
                 subset.shuffle_caption,
                 subset.caption_separator,
@@ -3391,13 +3405,13 @@ def add_dataset_arguments(
     parser: argparse.ArgumentParser, support_dreambooth: bool, support_caption: bool, support_caption_dropout: bool
 ):
     # dataset common
     parser.add_argument(
-        "--cache_meta", action="store_true"
-    )
-    parser.add_argument(
-        "--use_cached_meta", action="store_true"
+        "--train_data_dir", type=str, default=None, help="directory for train images / 学習画像データのディレクトリ"
     )
     parser.add_argument(
-        "--train_data_dir", type=str, default=None, help="directory for train images / 学習画像データのディレクトリ"
+        "--cache_info",
+        action="store_true",
+        help="cache meta information (caption and image size) for faster dataset loading. only available for DreamBooth"
+        + " / メタ情報(キャプションとサイズ)をキャッシュしてデータセット読み込みを高速化する。DreamBooth方式のみ有効",
     )
     parser.add_argument(
         "--shuffle_caption", action="store_true", help="shuffle separated caption / 区切られたcaptionの各要素をshuffleする"
     )
diff --git a/train_network.py b/train_network.py
index b42daba71..7ae9283cb 100644
--- a/train_network.py
+++ b/train_network.py
@@ -6,7 +6,6 @@
 import random
 import time
 import json
-import pickle
 from multiprocessing import Value

 import toml
@@ -14,18 +13,15 @@
 import torch
 from library.device_utils import init_ipex, clean_memory_on_device

-init_ipex()
-from torch.nn.parallel import DistributedDataParallel as DDP
+init_ipex()

 from accelerate.utils import set_seed
 from diffusers import DDPMScheduler
 from library import model_util
 import library.train_util as train_util
-from library.train_util import (
-    DreamBoothDataset, DatasetGroup
-)
+from library.train_util import DreamBoothDataset
 import library.config_util as config_util
 from library.config_util import (
     ConfigSanitizer,

From 1648ade6da549c7def2e21f236453e7938c499cd Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Sun, 24 Mar 2024 20:55:48 +0900
Subject: [PATCH 050/132] format by black

---
 library/config_util.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/library/config_util.py b/library/config_util.py
index eb652ecf3..ff4de0921 100644
--- a/library/config_util.py
+++ b/library/config_util.py
@@ -41,12 +41,17 @@
     DatasetGroup,
 )
 from .utils import setup_logging
+
 setup_logging()
 import logging
+
 logger = logging.getLogger(__name__)

+
 def add_config_arguments(parser: argparse.ArgumentParser):
-    parser.add_argument("--dataset_config", type=Path, default=None, help="config file for detail settings / 詳細な設定用の設定ファイル")
+    parser.add_argument(
+        "--dataset_config", type=Path, default=None, help="config file for detail settings / 詳細な設定用の設定ファイル"
+    )

 # TODO: inherit Params class in Subset, Dataset
@@ -362,7 +367,9 @@ def sanitize_argparse_namespace(self, argparse_namespace: argparse.Namespace) ->
             return self.argparse_config_validator(argparse_namespace)
         except MultipleInvalid:
             # XXX: this should be a bug
-            logger.error("Invalid cmdline parsed arguments. This should be a bug.
/ コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。") + logger.error( + "Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。" + ) raise # NOTE: value would be overwritten by latter dict if there is already the same key @@ -547,11 +554,11 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu " ", ) - logger.info(f'{info}') + logger.info(f"{info}") # make buckets first because it determines the length of dataset # and set the same seed for all datasets - seed = random.randint(0, 2**31) # actual seed is seed + epoch_no + seed = random.randint(0, 2**31) # actual seed is seed + epoch_no for i, dataset in enumerate(datasets): logger.info(f"[Dataset {i}]") dataset.make_buckets() @@ -638,13 +645,17 @@ def load_user_config(file: str) -> dict: with open(file, "r") as f: config = json.load(f) except Exception: - logger.error(f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}") + logger.error( + f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}" + ) raise elif file.name.lower().endswith(".toml"): try: config = toml.load(file) except Exception: - logger.error(f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}") + logger.error( + f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}" + ) raise else: raise ValueError(f"not supported config file format / 対応していない設定ファイルの形式です: {file}") @@ -671,13 +682,13 @@ def load_user_config(file: str) -> dict: train_util.prepare_dataset_args(argparse_namespace, config_args.support_finetuning) logger.info("[argparse_namespace]") - logger.info(f'{vars(argparse_namespace)}') + logger.info(f"{vars(argparse_namespace)}") user_config = load_user_config(config_args.dataset_config) logger.info("") logger.info("[user_config]") - logger.info(f'{user_config}') + logger.info(f"{user_config}") sanitizer = ConfigSanitizer( config_args.support_dreambooth, config_args.support_finetuning, config_args.support_controlnet, config_args.support_dropout @@ -686,10 +697,10 @@ def load_user_config(file: str) -> dict: logger.info("") logger.info("[sanitized_user_config]") - logger.info(f'{sanitized_user_config}') + logger.info(f"{sanitized_user_config}") blueprint = BlueprintGenerator(sanitizer).generate(user_config, argparse_namespace) logger.info("") logger.info("[blueprint]") - logger.info(f'{blueprint}') + logger.info(f"{blueprint}") From 9bbb28c3619a9ff86a51bdc7ea83584976840663 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 24 Mar 2024 22:06:37 +0900 Subject: [PATCH 051/132] update PyTorch version and reorganize dependencies --- README-ja.md | 55 ++---------- README.md | 149 ++++---------------------------- docs/train_SDXL-en.md | 84 ++++++++++++++++++ requirements.txt | 8 +- sdxl_minimal_inference.py | 30 +++++-- sdxl_train_textual_inversion.py | 1 - 6 files changed, 136 insertions(+), 191 deletions(-) create mode 100644 docs/train_SDXL-en.md diff --git a/README-ja.md b/README-ja.md index 29c33a659..1d83c44f1 100644 --- a/README-ja.md +++ b/README-ja.md @@ -1,7 +1,3 @@ -SDXLがサポートされました。sdxlブランチはmainブランチにマージされました。リポジトリを更新したときにはUpgradeの手順を実行してください。また accelerate のバージョンが上がっていますので、accelerate config を再度実行してください。 - -SDXL学習については[こちら](./README.md#sdxl-training)をご覧ください(英語です)。 - ## リポジトリについて Stable Diffusionの学習、画像生成、その他のスクリプトを入れたリポジトリです。 @@ -21,6 +17,7 @@ 
GUIやPowerShellスクリプトなど、より使いやすくする機能が[bma
* [学習について、共通編](./docs/train_README-ja.md) : データ整備やオプションなど
* [データセット設定](./docs/config_README-ja.md)
+* [SDXL学習](./docs/train_SDXL-en.md) (英語版)
* [DreamBoothの学習について](./docs/train_db_README-ja.md)
* [fine-tuningのガイド](./docs/fine_tune_README_ja.md):
* [LoRAの学習について](./docs/train_network_README-ja.md)
@@ -44,9 +41,7 @@ PowerShellを使う場合、venvを使えるようにするためには以下の

 ## Windows環境でのインストール

-スクリプトはPyTorch 2.0.1でテストしています。PyTorch 1.12.1でも動作すると思われます。
-
-以下の例ではPyTorchは2.0.1/CUDA 11.8版をインストールします。CUDA 11.6版やPyTorch 1.12.1を使う場合は適宜書き換えください。
+スクリプトはPyTorch 2.1.1でテストしています。PyTorch 2.0.1、1.12.1でも動作すると思われます。

 (なお、python -m venv~の行で「python」とだけ表示された場合、py -m venv~のようにpythonをpyに変更してください。)

 ```
 cd sd-scripts
 python -m venv venv
 .\venv\Scripts\activate

-pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 --index-url https://download.pytorch.org/whl/cu118
+pip install torch==2.1.1 torchvision==0.16.1 --index-url https://download.pytorch.org/whl/cu118
 pip install --upgrade -r requirements.txt
-pip install xformers==0.0.20
+pip install xformers==0.0.23 --index-url https://download.pytorch.org/whl/cu118

 accelerate config
 ```

 コマンドプロンプトでも同一です。

-(注:``python -m venv venv`` のほうが ``python -m venv --system-site-packages venv`` より安全そうなため書き換えました。globalなpythonにパッケージがインストールしてあると、後者だといろいろと問題が起きます。)
+注:`bitsandbytes==0.43.0`、`prodigyopt==1.0`、`lion-pytorch==0.0.6` は `requirements.txt` に含まれるようになりました。他のバージョンを使う場合は適宜インストールしてください。

-accelerate configの質問には以下のように答えてください。(bf16で学習する場合、最後の質問にはbf16と答えてください。)
+この例では PyTorch 2.1.1 および xformers 0.0.23 の CUDA 11.8 版をインストールします。CUDA 12.1 版や PyTorch 1.12.1 を使う場合は適宜書き換えてください。たとえば CUDA 12.1 版の場合は `pip install torch==2.1.1 torchvision==0.16.1 --index-url https://download.pytorch.org/whl/cu121` および `pip install xformers==0.0.23 --index-url https://download.pytorch.org/whl/cu121` としてください。

-※0.15.0から日本語環境では選択のためにカーソルキーを押すと落ちます(……)。数字キーの0、1、2……で選択できますので、そちらを使ってください。
+accelerate configの質問には以下のように答えてください。(bf16で学習する場合、最後の質問にはbf16と答えてください。)

 ```txt
 - This machine
@@ -87,41 +82,6 @@ accelerate configの質問には以下のように答えてください。(bf1
 ※場合によって ``ValueError: fp16 mixed precision requires a GPU`` というエラーが出ることがあるようです。この場合、6番目の質問( ``What GPU(s) (by id) should be used for training on this machine as a comma-separated list?
[all]:``)に「0」と答えてください。(id `0`のGPUが使われます。)

-### オプション:`bitsandbytes`(8bit optimizer)を使う
-
-`bitsandbytes`はオプションになりました。Linuxでは通常通りpipでインストールできます(0.41.1または以降のバージョンを推奨)。
-
-Windowsでは0.35.0または0.41.1を推奨します。
-
-- `bitsandbytes` 0.35.0: 安定しているとみられるバージョンです。AdamW8bitは使用できますが、他のいくつかの8bit optimizer、学習時の`full_bf16`オプションは使用できません。
-- `bitsandbytes` 0.41.1: Lion8bit、PagedAdamW8bit、PagedLion8bitをサポートします。`full_bf16`が使用できます。
-
-注:`bitsandbytes` 0.35.0から0.41.0までのバージョンには問題があるようです。 https://github.com/TimDettmers/bitsandbytes/issues/659
-
-以下の手順に従い、`bitsandbytes`をインストールしてください。
-
-### 0.35.0を使う場合
-
-PowerShellの例です。コマンドプロンプトではcpの代わりにcopyを使ってください。
-
-```powershell
-cd sd-scripts
-.\venv\Scripts\activate
-pip install bitsandbytes==0.35.0
-
-cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\
-cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py
-cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py
-```
-
-### 0.41.1を使う場合
-
-jllllll氏の配布されている[こちら](https://github.com/jllllll/bitsandbytes-windows-webui) または他の場所から、Windows用のwhlファイルをインストールしてください。
-
-```powershell
-python -m pip install bitsandbytes==0.41.1 --prefer-binary --extra-index-url=https://jllllll.github.io/bitsandbytes-windows-webui
-```
-
 ## アップグレード

 新しいリリースがあった場合、以下のコマンドで更新できます。
@@ -151,4 +111,3 @@ Conv2d 3x3への拡大は [cloneofsimo氏](https://github.com/cloneofsimo/lora)
 [BLIP](https://github.com/salesforce/BLIP): BSD-3-Clause

-
diff --git a/README.md b/README.md
index a19f7968a..ef26acab8 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,3 @@
-__SDXL is now supported. The sdxl branch has been merged into the main branch. If you update the repository, please follow the upgrade instructions. Also, the version of accelerate has been updated, so please run accelerate config again.__ The documentation for SDXL training is [here](./README.md#sdxl-training).
-
 This repository contains training, generation and utility scripts for Stable Diffusion.

 [__Change History__](#change-history) is moved to the bottom of the page.
@@ -20,9 +18,9 @@ This repository contains the scripts for:

 ## About requirements.txt

-These files do not contain requirements for PyTorch. Because the versions of them depend on your environment. Please install PyTorch at first (see installation guide below.)
+The file does not contain requirements for PyTorch. Because the version of PyTorch depends on the environment, it is not included in the file. Please install PyTorch first according to the environment. See installation instructions below.

-The scripts are tested with Pytorch 2.0.1. 1.12.1 is not tested but should work.
+The scripts are tested with PyTorch 2.1.1. 2.0.1 and 1.12.1 are not tested but should work.

 ## Links to usage documentation

@@ -32,12 +30,13 @@ Most of the documents are written in Japanese.
 * [Training guide - common](./docs/train_README-ja.md) : data preparation, options etc...
  * [Chinese version](./docs/train_README-zh.md)
+* [SDXL training](./docs/train_SDXL-en.md) (English version)
* [Dataset config](./docs/config_README-ja.md)
  * [English version](./docs/config_README-en.md)
* [DreamBooth training guide](./docs/train_db_README-ja.md)
* [Step by Step fine-tuning guide](./docs/fine_tune_README_ja.md):
-* [training LoRA](./docs/train_network_README-ja.md)
-* [training Textual Inversion](./docs/train_ti_README-ja.md)
+* [Training LoRA](./docs/train_network_README-ja.md)
+* [Training Textual Inversion](./docs/train_ti_README-ja.md)
* [Image generation](./docs/gen_img_README-ja.md)
* note.com [Model conversion](https://note.com/kohya_ss/n/n374f316fe4ad)
@@ -65,14 +64,18 @@ cd sd-scripts
 python -m venv venv
 .\venv\Scripts\activate

-pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 --index-url https://download.pytorch.org/whl/cu118
+pip install torch==2.1.1 torchvision==0.16.1 --index-url https://download.pytorch.org/whl/cu118
 pip install --upgrade -r requirements.txt
-pip install xformers==0.0.20
+pip install xformers==0.0.23 --index-url https://download.pytorch.org/whl/cu118

 accelerate config
 ```

-__Note:__ Now bitsandbytes is optional. Please install any version of bitsandbytes as needed. Installation instructions are in the following section.
+If `python -m venv` shows only `python`, change `python` to `py`.
+
+__Note:__ Now `bitsandbytes==0.43.0`, `prodigyopt==1.0` and `lion-pytorch==0.0.6` are included in the requirements.txt. If you'd like to use another version, please install it manually.
+
+This installation is for CUDA 11.8. If you use a different version of CUDA, please install the appropriate version of PyTorch and xformers. For example, if you use CUDA 12, please run `pip install torch==2.1.1 torchvision==0.16.1 --index-url https://download.pytorch.org/whl/cu121` and `pip install xformers==0.0.23 --index-url https://download.pytorch.org/whl/cu121`.
+
+### 学習中のサンプル画像生成
+
+プロンプトファイルは例えば以下のようになります。
+
+```
+# prompt 1
+masterpiece, best quality, (1girl), in white shirts, upper body, looking at viewer, simple background --n low quality, worst quality, bad anatomy,bad composition, poor, low effort --w 768 --h 768 --d 1 --l 7.5 --s 28
+
+# prompt 2
+masterpiece, best quality, 1boy, in business suit, standing at street, looking back --n (low quality, worst quality), bad anatomy,bad composition, poor, low effort --w 576 --h 832 --d 2 --l 5.5 --s 40
+```
+
+ `#` で始まる行はコメントになります。`--n` のように「ハイフン二個+英小文字」の形でオプションを指定できます。以下が使用可能です。
+
+ * `--n` Negative prompt up to the next option.
+ * `--w` Specifies the width of the generated image.
+ * `--h` Specifies the height of the generated image.
+ * `--d` Specifies the seed of the generated image.
+ * `--l` Specifies the CFG scale of the generated image.
+ * `--s` Specifies the number of steps in the generation.
+
+ `( )` や `[ ]` などの重みづけも動作します。

diff --git a/README.md b/README.md
index 5282c1f69..1ca699be5 100644
--- a/README.md
+++ b/README.md
@@ -137,15 +137,16 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser

 ## Change History

-### Mar XX, 2024 / 2024/3/XX: v0.8.6
+### Apr 7, 2024 / 2024-04-07: v0.8.6

 #### Highlights

 - The dependent libraries are updated. Please see [Upgrade](#upgrade) and update the libraries.
   - Especially `imagesize` is newly added, so if you cannot update the libraries immediately, please install with `pip install imagesize==1.4.1` separately.
   - `bitsandbytes==0.43.0`, `prodigyopt==1.0`, `lion-pytorch==0.0.6` are included in the requirements.txt.
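To verify that an installed environment matches the pinned dependencies above, a quick sanity check can be run. This is an editorial sketch using standard version attributes; the expected values depend on which CUDA variant was installed:

```python
import torch
import torchvision
import xformers

# Expected values for the CUDA 11.8 commands above: 2.1.1+cu118 / 0.16.1+cu118 / 0.0.23
print("torch:", torch.__version__)
print("torchvision:", torchvision.__version__)
print("xformers:", xformers.__version__)
print("CUDA available:", torch.cuda.is_available())
```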
+ - `bitsandbytes` no longer requires complex procedures as it now officially supports Windows. - Also, the PyTorch version is updated to 2.1.2 (PyTorch does not need to be updated immediately). In the upgrade procedure, PyTorch is not updated, so please manually install or update torch, torchvision, xformers if necessary (see [Upgrade PyTorch](#upgrade-pytorch)). -- When logging to wandb is enabled, the entire command line is exposed. Therefore, it is recommended to write the API key of wandb and the token of HuggingFace in the configuration file (`.toml`). Thanks to bghira for raising the issue. +- When logging to wandb is enabled, the entire command line is exposed. Therefore, it is recommended to write wandb API key and HuggingFace token in the configuration file (`.toml`). Thanks to bghira for raising the issue. - A warning is displayed at the start of training if such information is included in the command line. - Also, if there is an absolute path, the path may be exposed, so it is recommended to specify a relative path or write it in the configuration file. In such cases, an INFO log is displayed. - See [#1123](https://github.com/kohya-ss/sd-scripts/pull/1123) and PR [#1240](https://github.com/kohya-ss/sd-scripts/pull/1240) for details. @@ -223,6 +224,7 @@ See PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) for details. - 依存ライブラリが更新されました。[アップグレード](./README-ja.md#アップグレード) を参照しライブラリを更新してください。 - 特に `imagesize` が新しく追加されていますので、すぐにライブラリの更新ができない場合は `pip install imagesize==1.4.1` で個別にインストールしてください。 - `bitsandbytes==0.43.0`、`prodigyopt==1.0`、`lion-pytorch==0.0.6` が requirements.txt に含まれるようになりました。 + - `bitsandbytes` が公式に Windows をサポートしたため複雑な手順が不要になりました。 - また PyTorch のバージョンを 2.1.2 に更新しました。PyTorch はすぐに更新する必要はありません。更新時は、アップグレードの手順では PyTorch が更新されませんので、torch、torchvision、xformers を手動でインストールしてください。 - wandb へのログ出力が有効の場合、コマンドライン全体が公開されます。そのため、コマンドラインに wandb の API キーや HuggingFace のトークンなどが含まれる場合、設定ファイル(`.toml`)への記載をお勧めします。問題提起していただいた bghira 氏に感謝します。 - このような場合には学習開始時に警告が表示されます。 @@ -315,27 +317,14 @@ The LoRA supported by `train_network.py` has been named to avoid confusion. The In addition to 1., LoRA for Conv2d layers with 3x3 kernel -LoRA-LierLa is the default LoRA type for `train_network.py` (without `conv_dim` network arg). LoRA-LierLa can be used with [our extension](https://github.com/kohya-ss/sd-webui-additional-networks) for AUTOMATIC1111's Web UI, or with the built-in LoRA feature of the Web UI. - -To use LoRA-C3Lier with Web UI, please use our extension. - -### LoRAの名称について - -`train_network.py` がサポートするLoRAについて、混乱を避けるため名前を付けました。ドキュメントは更新済みです。以下は当リポジトリ内の独自の名称です。 - -1. __LoRA-LierLa__ : (LoRA for __Li__ n __e__ a __r__ __La__ yers、リエラと読みます) - - Linear 層およびカーネルサイズ 1x1 の Conv2d 層に適用されるLoRA - -2. __LoRA-C3Lier__ : (LoRA for __C__ olutional layers with __3__ x3 Kernel and __Li__ n __e__ a __r__ layers、セリアと読みます) - - 1.に加え、カーネルサイズ 3x3 の Conv2d 層に適用されるLoRA - -LoRA-LierLa は[Web UI向け拡張](https://github.com/kohya-ss/sd-webui-additional-networks)、またはAUTOMATIC1111氏のWeb UIのLoRA機能で使用することができます。 +LoRA-LierLa is the default LoRA type for `train_network.py` (without `conv_dim` network arg). + -## Sample image generation during training +### Sample image generation during training A prompt file might look like this, for example ``` @@ -356,26 +345,3 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b * `--s` Specifies the number of steps in the generation. The prompt weighting such as `( )` and `[ ]` are working. 
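To make the option syntax above concrete, here is a hedged sketch of a parser for such prompt lines (illustrative only; it is not the repository's actual implementation, and the key names are ours):

```python
import re

# Readable names for the one-letter flags documented above (our naming).
OPTION_KEYS = {"n": "negative_prompt", "w": "width", "h": "height", "d": "seed", "l": "cfg_scale", "s": "steps"}

def parse_prompt_line(line: str) -> dict:
    # Split on "--x " markers: parts[0] is the prompt, then flags and values alternate.
    parts = re.split(r"--([a-z]) ", line)
    parsed = {"prompt": parts[0].strip()}
    for flag, value in zip(parts[1::2], parts[2::2]):
        parsed[OPTION_KEYS.get(flag, flag)] = value.strip()
    return parsed

# parse_prompt_line("1girl, white shirts --w 768 --h 768 --d 1 --l 7.5 --s 28")
# -> {"prompt": "1girl, white shirts", "width": "768", "height": "768",
#     "seed": "1", "cfg_scale": "7.5", "steps": "28"}
```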
-
-## サンプル画像生成
-プロンプトファイルは例えば以下のようになります。
-
-```
-# prompt 1
-masterpiece, best quality, (1girl), in white shirts, upper body, looking at viewer, simple background --n low quality, worst quality, bad anatomy,bad composition, poor, low effort --w 768 --h 768 --d 1 --l 7.5 --s 28
-
-# prompt 2
-masterpiece, best quality, 1boy, in business suit, standing at street, looking back --n (low quality, worst quality), bad anatomy,bad composition, poor, low effort --w 576 --h 832 --d 2 --l 5.5 --s 40
-```
-
- `#` で始まる行はコメントになります。`--n` のように「ハイフン二個+英小文字」の形でオプションを指定できます。以下が使用可能できます。
-
- * `--n` Negative prompt up to the next option.
- * `--w` Specifies the width of the generated image.
- * `--h` Specifies the height of the generated image.
- * `--d` Specifies the seed of the generated image.
- * `--l` Specifies the CFG scale of the generated image.
- * `--s` Specifies the number of steps in the generation.
-
- `( )` や `[ ]` などの重みづけも動作します。
-

From c973b29da422911893f62b8acbe7c455f0c8c78b Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Sun, 7 Apr 2024 20:51:52 +0900
Subject: [PATCH 077/132] update readme

---
 README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 1ca699be5..83fa81e03 100644
--- a/README.md
+++ b/README.md
@@ -159,7 +159,7 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
 - Fixed a bug that U-Net and Text Encoders are included in the state in `train_network.py` and `sdxl_train_network.py`. The saving and loading of the state are faster, the file size is smaller, and the memory usage when loading is reduced.
 - DeepSpeed is supported. PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) and [#1139](https://github.com/kohya-ss/sd-scripts/pull/1139) Thanks to BootsofLagrangian! See PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) for details.
 - The masked loss is supported in each training script. PR [#1207](https://github.com/kohya-ss/sd-scripts/pull/1207) See [Masked loss](#about-masked-loss) for details.
-- Scheduled Huber Loss has been introduced to each training scripts. PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) Thanks to kabachuha for the PR and cheald, drhead, and others for the discussion! See [Scheduled Huber Loss](#about-scheduled-huber-loss) for details.
+- Scheduled Huber Loss has been introduced to each training script. PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) Thanks to kabachuha for the PR and cheald, drhead, and others for the discussion! See the PR and [Scheduled Huber Loss](#about-scheduled-huber-loss) for details.
 - The options `--noise_offset_random_strength` and `--ip_noise_gamma_random_strength` are added to each training script. These options can be used to vary the noise offset and ip noise gamma in the range of 0 to the specified value. PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) Thanks to KohakuBlueleaf!
 - The options `--save_state_on_train_end` are added to each training script. PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) Thanks to gesen2egee!
 - The options `--sample_every_n_epochs` and `--sample_every_n_steps` in each training script now display a warning and ignore them when a number less than or equal to `0` is specified. Thanks to S-Del for raising the issue.
@@ -219,6 +219,8 @@ See PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) for details.
- `huber_schedule`: Specify the scheduling method. Choose `exponential`, `constant`, or `SNR`. The default is `exponential`.
- `huber_c`: Specify the Huber's parameter. The default is `0.1`. +Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for recent updates. + #### 主要な変更点 - 依存ライブラリが更新されました。[アップグレード](./README-ja.md#アップグレード) を参照しライブラリを更新してください。 @@ -239,7 +241,7 @@ See PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) for details. - `train_network.py` および `sdxl_train_network.py` で、state に U-Net および Text Encoder が含まれる不具合を修正しました。state の保存、読み込みが高速化され、ファイルサイズも小さくなり、また読み込み時のメモリ使用量も削減されます。 - DeepSpeed がサポートされました。PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) 、[#1139](https://github.com/kohya-ss/sd-scripts/pull/1139) BootsofLagrangian 氏に感謝します。詳細は PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) をご覧ください。 - 各学習スクリプトでマスクロスをサポートしました。PR [#1207](https://github.com/kohya-ss/sd-scripts/pull/1207) 詳細は [マスクロスについて](#マスクロスについて) をご覧ください。 -- 各学習スクリプトに Scheduled Huber Loss を追加しました。PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) ご提案いただいた kabachuha 氏、および議論を深めてくださった cheald 氏、drhead 氏を始めとする諸氏に感謝します。詳細は [Scheduled Huber Loss について](#scheduled-huber-loss-について) をご覧ください。 +- 各学習スクリプトに Scheduled Huber Loss を追加しました。PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) ご提案いただいた kabachuha 氏、および議論を深めてくださった cheald 氏、drhead 氏を始めとする諸氏に感謝します。詳細は当該 PR および [Scheduled Huber Loss について](#scheduled-huber-loss-について) をご覧ください。 - 各学習スクリプトに、noise offset、ip noise gammaを、それぞれ 0~指定した値の範囲で変動させるオプション `--noise_offset_random_strength` および `--ip_noise_gamma_random_strength` が追加されました。 PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) KohakuBlueleaf 氏に感謝します。 - 各学習スクリプトに、学習終了時に state を保存する `--save_state_on_train_end` オプションが追加されました。 PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) gesen2egee 氏に感謝します。 - 各学習スクリプトで `--sample_every_n_epochs` および `--sample_every_n_steps` オプションに `0` 以下の数値を指定した時、警告を表示するとともにそれらを無視するよう変更しました。問題提起していただいた S-Del 氏に感謝します。 @@ -280,9 +282,6 @@ See PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) for details. マスクの指定には ControlNet データセットを使用します。マスク画像は RGB 画像である必要があります。R チャンネルのピクセル値 255 がロス計算対象、0 がロス計算対象外になります。0-255 の値は、0-1 の範囲に変換されます(つまりピクセル値 128 の部分はロスの重みが半分になります)。データセットの詳細は [LLLite ドキュメント](./docs/train_lllite_README-ja.md#データセットの準備) をご覧ください。 -Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for recent updates. 
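To make `loss_type`, `huber_schedule`, and `huber_c` concrete, here is an editorial sketch of a timestep-scheduled pseudo-Huber loss. The formulas are illustrative readings, not the scripts' exact code; the schedules actually implemented are defined in PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228):

```python
import math
import torch

def scheduled_huber_c(huber_c: float, timestep: int, num_timesteps: int = 1000) -> float:
    # One reading of the "exponential" schedule: decay smoothly from 1 at t=0
    # down to huber_c at t=num_timesteps.
    alpha = -math.log(huber_c) / num_timesteps
    return math.exp(-alpha * timestep)

def pseudo_huber_loss(pred: torch.Tensor, target: torch.Tensor, c: float) -> torch.Tensor:
    # Smooth Huber variant: roughly quadratic for small errors, linear for large
    # ones, with c controlling the transition point.
    return 2 * c * (torch.sqrt((pred - target) ** 2 + c**2) - c)
```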
-最近の更新情報は [Release](https://github.com/kohya-ss/sd-scripts/releases) をご覧ください。 - #### Scheduled Huber Loss について 各学習スクリプトに、学習データ中の異常値や外れ値(data corruption)への耐性を高めるための手法、Scheduled Huber Lossが導入されました。 @@ -303,6 +302,8 @@ Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for rece PR 内でいくつかの比較が共有されています。この機能を試す場合、最初は `--loss_type smooth_l1 --huber_schedule snr --huber_c 0.1` などで試してみるとよいかもしれません。 +最近の更新情報は [Release](https://github.com/kohya-ss/sd-scripts/releases) をご覧ください。 + ## Additional Information ### Naming of LoRA From bfb352bc433326a77aca3124248331eb60c49e8c Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 7 Apr 2024 21:07:52 +0900 Subject: [PATCH 078/132] change huber_schedule from `exponential` to `snr` --- README.md | 10 ++++++++-- library/train_util.py | 6 +++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 83fa81e03..a7047a360 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,12 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser ## Change History +### Apr 7, 2024 / 2024-04-07: v0.8.7 + +- The default value of `huber_schedule` in Scheduled Huber Loss is changed from `exponential` to `snr`, which is expected to give better results. + +- Scheduled Huber Loss の `huber_schedule` のデフォルト値を `exponential` から、より良い結果が期待できる `snr` に変更しました。 + ### Apr 7, 2024 / 2024-04-07: v0.8.6 #### Highlights @@ -216,7 +222,7 @@ The newly added arguments loss_type, huber_schedule, and huber_c allow for the s See PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) for details. - `loss_type`: Specify the loss function type. Choose `huber` for Huber loss, `smooth_l1` for smooth L1 loss, and `l2` for MSE loss. The default is `l2`, which is the same as before. -- `huber_schedule`: Specify the scheduling method. Choose `exponential`, `constant`, or `SNR`. The default is `exponential`. +- `huber_schedule`: Specify the scheduling method. Choose `exponential`, `constant`, or `snr`. The default is `snr`. - `huber_c`: Specify the Huber's parameter. The default is `0.1`. Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for recent updates. @@ -297,7 +303,7 @@ Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for rece 詳細は PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) をご覧ください。 - `loss_type` : 損失関数の種類を指定します。`huber` で Huber損失、`smooth_l1` で smooth L1 損失、`l2` で MSE 損失を選択します。デフォルトは `l2` で、従来と同様です。 -- `huber_schedule` : スケジューリング方法を指定します。`exponential` で指数関数的、`constant` で一定、`snr` で信号対雑音比に基づくスケジューリングを選択します。デフォルトは `exponential` です。 +- `huber_schedule` : スケジューリング方法を指定します。`exponential` で指数関数的、`constant` で一定、`snr` で信号対雑音比に基づくスケジューリングを選択します。デフォルトは `snr` です。 - `huber_c` : Huber損失のパラメータを指定します。デフォルトは `0.1` です。 PR 内でいくつかの比較が共有されています。この機能を試す場合、最初は `--loss_type smooth_l1 --huber_schedule snr --huber_c 0.1` などで試してみるとよいかもしれません。 diff --git a/library/train_util.py b/library/train_util.py index 9ce129bd9..15c23f3cc 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3246,10 +3246,10 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: parser.add_argument( "--huber_schedule", type=str, - default="exponential", + default="snr", choices=["constant", "exponential", "snr"], - help="The scheduling method for Huber loss (constant, exponential, or SNR-based). Only used when loss_type is 'huber' or 'smooth_l1'. 
default is exponential" - + " / Huber損失のスケジューリング方法(constant、exponential、またはSNRベース)。loss_typeが'huber'または'smooth_l1'の場合に有効、デフォルトはexponential", + help="The scheduling method for Huber loss (constant, exponential, or SNR-based). Only used when loss_type is 'huber' or 'smooth_l1'. default is snr" + + " / Huber損失のスケジューリング方法(constant、exponential、またはSNRベース)。loss_typeが'huber'または'smooth_l1'の場合に有効、デフォルトは snr", ) parser.add_argument( "--huber_c", From 75833e84a1c7e3c2fb0a9e3ce0fe3d8c1758a012 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Mon, 8 Apr 2024 19:23:02 -0400 Subject: [PATCH 079/132] Fix default LR, Add overall LoRA+ ratio, Add log `--loraplus_ratio` added for both TE and UNet Add log for lora+ --- library/train_util.py | 1 + networks/dylora.py | 24 ++++++------- networks/lora.py | 28 ++++++++-------- networks/lora_fa.py | 30 ++++++++--------- train_network.py | 78 ++++++++++++++++++++++++++++++++----------- 5 files changed, 101 insertions(+), 60 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 4e5ab7370..7c2bf6935 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2789,6 +2789,7 @@ def add_optimizer_arguments(parser: argparse.ArgumentParser): default=1, help="Polynomial power for polynomial scheduler / polynomialスケジューラでのpolynomial power", ) + parser.add_argument("--loraplus_lr_ratio", default=None, type=float, help="LoRA+ learning rate ratio") parser.add_argument("--loraplus_unet_lr_ratio", default=None, type=float, help="LoRA+ UNet learning rate ratio") parser.add_argument("--loraplus_text_encoder_lr_ratio", default=None, type=float, help="LoRA+ text encoder learning rate ratio") diff --git a/networks/dylora.py b/networks/dylora.py index edc3e2229..dc5c7cb35 100644 --- a/networks/dylora.py +++ b/networks/dylora.py @@ -412,32 +412,32 @@ def prepare_optimizer_params( text_encoder_lr, unet_lr, default_lr, - unet_lora_plus_ratio=None, - text_encoder_lora_plus_ratio=None + unet_loraplus_ratio=None, + text_encoder_loraplus_ratio=None, + loraplus_ratio=None ): self.requires_grad_(True) all_params = [] - def assemble_params(loras, lr, lora_plus_ratio): + def assemble_params(loras, lr, ratio): param_groups = {"lora": {}, "plus": {}} for lora in loras: for name, param in lora.named_parameters(): - if lora_plus_ratio is not None and "lora_up" in name: + if ratio is not None and "lora_B" in name: param_groups["plus"][f"{lora.lora_name}.{name}"] = param else: param_groups["lora"][f"{lora.lora_name}.{name}"] = param - # assigned_param_groups = "" - # for group in param_groups: - # assigned_param_groups += f"{group}\n {list(param_groups[group].keys())}\n\n" - # logger.info(assigned_param_groups) - params = [] for key in param_groups.keys(): param_data = {"params": param_groups[key].values()} + + if len(param_data["params"]) == 0: + continue + if lr is not None: if key == "plus": - param_data["lr"] = lr * lora_plus_ratio + param_data["lr"] = lr * ratio else: param_data["lr"] = lr @@ -452,7 +452,7 @@ def assemble_params(loras, lr, lora_plus_ratio): params = assemble_params( self.text_encoder_loras, text_encoder_lr if text_encoder_lr is not None else default_lr, - text_encoder_lora_plus_ratio + text_encoder_loraplus_ratio or loraplus_ratio ) all_params.extend(params) @@ -460,7 +460,7 @@ def assemble_params(loras, lr, lora_plus_ratio): params = assemble_params( self.unet_loras, default_lr if unet_lr is None else unet_lr, - unet_lora_plus_ratio + unet_loraplus_ratio or loraplus_ratio ) all_params.extend(params) diff --git a/networks/lora.py b/networks/lora.py index 
e082941e5..6cb05bcb0 100644 --- a/networks/lora.py +++ b/networks/lora.py @@ -1040,32 +1040,32 @@ def prepare_optimizer_params( text_encoder_lr, unet_lr, default_lr, - unet_lora_plus_ratio=None, - text_encoder_lora_plus_ratio=None + unet_loraplus_ratio=None, + text_encoder_loraplus_ratio=None, + loraplus_ratio=None ): self.requires_grad_(True) all_params = [] - def assemble_params(loras, lr, lora_plus_ratio): + def assemble_params(loras, lr, ratio): param_groups = {"lora": {}, "plus": {}} for lora in loras: for name, param in lora.named_parameters(): - if lora_plus_ratio is not None and "lora_up" in name: + if ratio is not None and "lora_up" in name: param_groups["plus"][f"{lora.lora_name}.{name}"] = param else: param_groups["lora"][f"{lora.lora_name}.{name}"] = param - # assigned_param_groups = "" - # for group in param_groups: - # assigned_param_groups += f"{group}\n {list(param_groups[group].keys())}\n\n" - # logger.info(assigned_param_groups) - params = [] for key in param_groups.keys(): param_data = {"params": param_groups[key].values()} + + if len(param_data["params"]) == 0: + continue + if lr is not None: if key == "plus": - param_data["lr"] = lr * lora_plus_ratio + param_data["lr"] = lr * ratio else: param_data["lr"] = lr @@ -1080,7 +1080,7 @@ def assemble_params(loras, lr, lora_plus_ratio): params = assemble_params( self.text_encoder_loras, text_encoder_lr if text_encoder_lr is not None else default_lr, - text_encoder_lora_plus_ratio + text_encoder_loraplus_ratio or loraplus_ratio ) all_params.extend(params) @@ -1099,15 +1099,15 @@ def assemble_params(loras, lr, lora_plus_ratio): params = assemble_params( block_loras, (unet_lr if unet_lr is not None else default_lr) * self.get_lr_weight(block_loras[0]), - unet_lora_plus_ratio + unet_loraplus_ratio or loraplus_ratio ) all_params.extend(params) else: params = assemble_params( self.unet_loras, - default_lr if unet_lr is None else unet_lr, - unet_lora_plus_ratio + unet_lr if unet_lr is not None else default_lr, + unet_loraplus_ratio or loraplus_ratio ) all_params.extend(params) diff --git a/networks/lora_fa.py b/networks/lora_fa.py index 3f6774dd8..2eff86d6c 100644 --- a/networks/lora_fa.py +++ b/networks/lora_fa.py @@ -1038,32 +1038,32 @@ def prepare_optimizer_params( text_encoder_lr, unet_lr, default_lr, - unet_lora_plus_ratio=None, - text_encoder_lora_plus_ratio=None + unet_loraplus_ratio=None, + text_encoder_loraplus_ratio=None, + loraplus_ratio=None ): self.requires_grad_(True) all_params = [] - def assemble_params(loras: List[LoRAModule], lr, lora_plus_ratio): + def assemble_params(loras, lr, ratio): param_groups = {"lora": {}, "plus": {}} for lora in loras: - for name, param in lora.get_trainable_named_params(): - if lora_plus_ratio is not None and "lora_up" in name: + for name, param in lora.named_parameters(): + if ratio is not None and "lora_up" in name: param_groups["plus"][f"{lora.lora_name}.{name}"] = param else: param_groups["lora"][f"{lora.lora_name}.{name}"] = param - # assigned_param_groups = "" - # for group in param_groups: - # assigned_param_groups += f"{group}\n {list(param_groups[group].keys())}\n\n" - # logger.info(assigned_param_groups) - params = [] for key in param_groups.keys(): param_data = {"params": param_groups[key].values()} + + if len(param_data["params"]) == 0: + continue + if lr is not None: if key == "plus": - param_data["lr"] = lr * lora_plus_ratio + param_data["lr"] = lr * ratio else: param_data["lr"] = lr @@ -1078,7 +1078,7 @@ def assemble_params(loras: List[LoRAModule], lr, lora_plus_ratio): 
params = assemble_params( self.text_encoder_loras, text_encoder_lr if text_encoder_lr is not None else default_lr, - text_encoder_lora_plus_ratio + text_encoder_loraplus_ratio or loraplus_ratio ) all_params.extend(params) @@ -1097,15 +1097,15 @@ def assemble_params(loras: List[LoRAModule], lr, lora_plus_ratio): params = assemble_params( block_loras, (unet_lr if unet_lr is not None else default_lr) * self.get_lr_weight(block_loras[0]), - unet_lora_plus_ratio + unet_loraplus_ratio or loraplus_ratio ) all_params.extend(params) else: params = assemble_params( self.unet_loras, - default_lr if unet_lr is None else unet_lr, - unet_lora_plus_ratio + unet_lr if unet_lr is not None else default_lr, + unet_loraplus_ratio or loraplus_ratio ) all_params.extend(params) diff --git a/train_network.py b/train_network.py index ba0c124d1..43226fc47 100644 --- a/train_network.py +++ b/train_network.py @@ -66,34 +66,69 @@ def generate_step_logs( lrs = lr_scheduler.get_last_lr() - if args.network_train_text_encoder_only or len(lrs) <= 2: # not block lr (or single block) - if args.network_train_unet_only: - logs["lr/unet"] = float(lrs[0]) - elif args.network_train_text_encoder_only: - logs["lr/textencoder"] = float(lrs[0]) - else: - logs["lr/textencoder"] = float(lrs[0]) - logs["lr/unet"] = float(lrs[-1]) # may be same to textencoder - - if ( - args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower() - ): # tracking d*lr value of unet. - logs["lr/d*lr"] = ( - lr_scheduler.optimizers[-1].param_groups[0]["d"] * lr_scheduler.optimizers[-1].param_groups[0]["lr"] - ) - else: + if len(lrs) > 4: idx = 0 if not args.network_train_unet_only: logs["lr/textencoder"] = float(lrs[0]) idx = 1 for i in range(idx, len(lrs)): - logs[f"lr/group{i}"] = float(lrs[i]) + lora_plus = "" + group_id = i + + if args.loraplus_lr_ratio is not None or args.loraplus_unet_lr_ratio is not None: + lora_plus = '_lora+' if i % 2 == 1 else '' + group_id = int((i / 2) + (i % 2 + 0.5)) + + logs[f"lr/group{group_id}{lora_plus}"] = float(lrs[i]) if args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower(): - logs[f"lr/d*lr/group{i}"] = ( + logs[f"lr/d*lr/group{group_id}{lora_plus}"] = ( lr_scheduler.optimizers[-1].param_groups[i]["d"] * lr_scheduler.optimizers[-1].param_groups[i]["lr"] ) + else: + if args.network_train_text_encoder_only: + if args.loraplus_lr_ratio is not None or args.loraplus_text_encoder_lr_ratio is not None: + logs["lr/textencoder"] = float(lrs[0]) + logs["lr/textencoder_lora+"] = float(lrs[1]) + else: + logs["lr/textencoder"] = float(lrs[0]) + + elif args.network_train_unet_only: + if args.loraplus_lr_ratio is not None or args.loraplus_unet_lr_ratio is not None: + logs["lr/unet"] = float(lrs[0]) + logs["lr/unet_lora+"] = float(lrs[1]) + else: + logs["lr/unet"] = float(lrs[0]) + else: + if len(lrs) == 2: + if args.loraplus_text_encoder_lr_ratio is not None and args.loraplus_unet_lr_ratio is None: + logs["lr/textencoder"] = float(lrs[0]) + logs["lr/textencoder_lora+"] = float(lrs[1]) + elif args.loraplus_unet_lr_ratio is not None and args.loraplus_text_encoder_lr_ratio is None: + logs["lr/unet"] = float(lrs[0]) + logs["lr/unet_lora+"] = float(lrs[1]) + elif args.loraplus_unet_lr_ratio is None and args.loraplus_text_encoder_lr_ratio is None and args.loraplus_lr_ratio is not None: + logs["lr/all"] = float(lrs[0]) + logs["lr/all_lora+"] = float(lrs[1]) + else: + logs["lr/textencoder"] = float(lrs[0]) + logs["lr/unet"] = float(lrs[-1]) 
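The logging above leans on the fact that lr_scheduler.get_last_lr() returns one value per optimizer param_group, in the order the groups were created. A toy sketch (model and rates invented):

import torch

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.SGD([
    {"params": [model.weight], "lr": 1e-4},  # e.g. the text encoder group
    {"params": [model.bias], "lr": 1e-3},    # e.g. the unet group
])
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda step: 1.0)
print(scheduler.get_last_lr())  # [0.0001, 0.001], one entry per group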
+ elif len(lrs) == 4: + logs["lr/textencoder"] = float(lrs[0]) + logs["lr/textencoder_lora+"] = float(lrs[1]) + logs["lr/unet"] = float(lrs[2]) + logs["lr/unet_lora+"] = float(lrs[3]) + else: + logs["lr/all"] = float(lrs[0]) + + if ( + args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower() + ): # tracking d*lr value of unet. + logs["lr/d*lr"] = ( + lr_scheduler.optimizers[-1].param_groups[0]["d"] * lr_scheduler.optimizers[-1].param_groups[0]["lr"] + ) + return logs def assert_extra_args(self, args, train_dataset_group): @@ -339,7 +374,7 @@ def train(self, args): # 後方互換性を確保するよ try: - trainable_params = network.prepare_optimizer_params(args.text_encoder_lr, args.unet_lr, args.learning_rate, args.loraplus_text_encoder_lr_ratio, args.loraplus_unet_lr_ratio) + trainable_params = network.prepare_optimizer_params(args.text_encoder_lr, args.unet_lr, args.learning_rate, args.loraplus_text_encoder_lr_ratio, args.loraplus_unet_lr_ratio, args.loraplus_lr_ratio) except TypeError: accelerator.print( "Deprecated: use prepare_optimizer_params(text_encoder_lr, unet_lr, learning_rate) instead of prepare_optimizer_params(text_encoder_lr, unet_lr)" @@ -348,6 +383,11 @@ def train(self, args): optimizer_name, optimizer_args, optimizer = train_util.get_optimizer(args, trainable_params) + if args.loraplus_lr_ratio is not None or args.loraplus_text_encoder_lr_ratio is not None or args.loraplus_unet_lr_ratio is not None: + assert ( + (optimizer_name != "Prodigy" and "DAdapt" not in optimizer_name) + ), "LoRA+ and Prodigy/DAdaptation is not supported" + # dataloaderを準備する # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers From 68467bdf4d76ba2c57289209b0ffd6ba599e2080 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 11 Apr 2024 17:33:19 -0400 Subject: [PATCH 080/132] Fix unset or invalid LR from making a param_group --- networks/dylora.py | 4 ++-- networks/lora.py | 5 +++-- networks/lora_fa.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/networks/dylora.py b/networks/dylora.py index dc5c7cb35..0546fc7ae 100644 --- a/networks/dylora.py +++ b/networks/dylora.py @@ -412,8 +412,8 @@ def prepare_optimizer_params( text_encoder_lr, unet_lr, default_lr, - unet_loraplus_ratio=None, text_encoder_loraplus_ratio=None, + unet_loraplus_ratio=None, loraplus_ratio=None ): self.requires_grad_(True) @@ -441,7 +441,7 @@ def assemble_params(loras, lr, ratio): else: param_data["lr"] = lr - if ("lr" in param_data) and (param_data["lr"] == 0): + if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: continue params.append(param_data) diff --git a/networks/lora.py b/networks/lora.py index 6cb05bcb0..d74608fea 100644 --- a/networks/lora.py +++ b/networks/lora.py @@ -1040,8 +1040,8 @@ def prepare_optimizer_params( text_encoder_lr, unet_lr, default_lr, - unet_loraplus_ratio=None, text_encoder_loraplus_ratio=None, + unet_loraplus_ratio=None, loraplus_ratio=None ): self.requires_grad_(True) @@ -1069,7 +1069,8 @@ def assemble_params(loras, lr, ratio): else: param_data["lr"] = lr - if ("lr" in param_data) and (param_data["lr"] == 0): + if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: + print("NO LR skipping!") continue params.append(param_data) diff --git a/networks/lora_fa.py b/networks/lora_fa.py index 2eff86d6c..9a608118a 100644 --- a/networks/lora_fa.py +++ b/networks/lora_fa.py @@ -1038,8 +1038,8 @@ def 
prepare_optimizer_params( text_encoder_lr, unet_lr, default_lr, - unet_loraplus_ratio=None, text_encoder_loraplus_ratio=None, + unet_loraplus_ratio=None, loraplus_ratio=None ): self.requires_grad_(True) @@ -1067,7 +1067,7 @@ def assemble_params(loras, lr, ratio): else: param_data["lr"] = lr - if ("lr" in param_data) and (param_data["lr"] == 0): + if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: continue params.append(param_data) From 4f203ce40d3a4647d52a2570a228e279dd04b321 Mon Sep 17 00:00:00 2001 From: 2kpr <96332338+2kpr@users.noreply.github.com> Date: Sun, 14 Apr 2024 09:56:58 -0500 Subject: [PATCH 081/132] Fused backward pass --- library/adafactor_fused.py | 106 +++++++++++++++++++++++++++++++++++++ library/train_util.py | 13 +++++ sdxl_train.py | 29 +++++++--- 3 files changed, 142 insertions(+), 6 deletions(-) create mode 100644 library/adafactor_fused.py diff --git a/library/adafactor_fused.py b/library/adafactor_fused.py new file mode 100644 index 000000000..bdfc32ced --- /dev/null +++ b/library/adafactor_fused.py @@ -0,0 +1,106 @@ +import math +import torch +from transformers import Adafactor + +@torch.no_grad() +def adafactor_step_param(self, p, group): + if p.grad is None: + return + grad = p.grad + if grad.dtype in {torch.float16, torch.bfloat16}: + grad = grad.float() + if grad.is_sparse: + raise RuntimeError("Adafactor does not support sparse gradients.") + + state = self.state[p] + grad_shape = grad.shape + + factored, use_first_moment = Adafactor._get_options(group, grad_shape) + # State Initialization + if len(state) == 0: + state["step"] = 0 + + if use_first_moment: + # Exponential moving average of gradient values + state["exp_avg"] = torch.zeros_like(grad) + if factored: + state["exp_avg_sq_row"] = torch.zeros(grad_shape[:-1]).to(grad) + state["exp_avg_sq_col"] = torch.zeros(grad_shape[:-2] + grad_shape[-1:]).to(grad) + else: + state["exp_avg_sq"] = torch.zeros_like(grad) + + state["RMS"] = 0 + else: + if use_first_moment: + state["exp_avg"] = state["exp_avg"].to(grad) + if factored: + state["exp_avg_sq_row"] = state["exp_avg_sq_row"].to(grad) + state["exp_avg_sq_col"] = state["exp_avg_sq_col"].to(grad) + else: + state["exp_avg_sq"] = state["exp_avg_sq"].to(grad) + + p_data_fp32 = p + if p.dtype in {torch.float16, torch.bfloat16}: + p_data_fp32 = p_data_fp32.float() + + state["step"] += 1 + state["RMS"] = Adafactor._rms(p_data_fp32) + lr = Adafactor._get_lr(group, state) + + beta2t = 1.0 - math.pow(state["step"], group["decay_rate"]) + update = (grad ** 2) + group["eps"][0] + if factored: + exp_avg_sq_row = state["exp_avg_sq_row"] + exp_avg_sq_col = state["exp_avg_sq_col"] + + exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=(1.0 - beta2t)) + exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=(1.0 - beta2t)) + + # Approximation of exponential moving average of square of gradient + update = Adafactor._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col) + update.mul_(grad) + else: + exp_avg_sq = state["exp_avg_sq"] + + exp_avg_sq.mul_(beta2t).add_(update, alpha=(1.0 - beta2t)) + update = exp_avg_sq.rsqrt().mul_(grad) + + update.div_((Adafactor._rms(update) / group["clip_threshold"]).clamp_(min=1.0)) + update.mul_(lr) + + if use_first_moment: + exp_avg = state["exp_avg"] + exp_avg.mul_(group["beta1"]).add_(update, alpha=(1 - group["beta1"])) + update = exp_avg + + if group["weight_decay"] != 0: + p_data_fp32.add_(p_data_fp32, alpha=(-group["weight_decay"] * lr)) + + p_data_fp32.add_(-update) + + if p.dtype in {torch.float16, 
torch.bfloat16}: + p.copy_(p_data_fp32) + + +@torch.no_grad() +def adafactor_step(self, closure=None): + """ + Performs a single optimization step + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group["params"]: + adafactor_step_param(self, p, group) + + return loss + +def patch_adafactor_fused(optimizer: Adafactor): + optimizer.step_param = adafactor_step_param.__get__(optimizer) + optimizer.step = adafactor_step.__get__(optimizer) diff --git a/library/train_util.py b/library/train_util.py index 15c23f3cc..46b55c03e 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2920,6 +2920,11 @@ def add_optimizer_arguments(parser: argparse.ArgumentParser): default=1, help="Polynomial power for polynomial scheduler / polynomialスケジューラでのpolynomial power", ) + parser.add_argument( + "--fused_backward_pass", + action="store_true", + help="Combines backward pass and optimizer step to reduce VRAM usage / バックワードパスとオプティマイザステップを組み合わせてVRAMの使用量を削減します。", + ) def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: bool): @@ -3846,6 +3851,14 @@ def get_optimizer(args, trainable_params): optimizer_type = "AdamW" optimizer_type = optimizer_type.lower() + if args.fused_backward_pass: + assert ( + optimizer_type == "Adafactor".lower() + ), "fused_backward_pass currently only works with optimizer_type Adafactor / fused_backward_passは現在optimizer_type Adafactorでのみ機能します" + assert ( + args.gradient_accumulation_steps == 1 + ), "fused_backward_pass does not work with gradient_accumulation_steps > 1 / fused_backward_passはgradient_accumulation_steps>1では機能しません" + # 引数を分解する optimizer_kwargs = {} if args.optimizer_args is not None and len(args.optimizer_args) > 0: diff --git a/sdxl_train.py b/sdxl_train.py index 46d7860be..3b28575ed 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -430,6 +430,20 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder2 = accelerator.prepare(text_encoder2) optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + if args.fused_backward_pass: + import library.adafactor_fused + library.adafactor_fused.patch_adafactor_fused(optimizer) + for param_group in optimizer.param_groups: + for parameter in param_group["params"]: + if parameter.requires_grad: + def __grad_hook(tensor: torch.Tensor, param_group=param_group): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(tensor, args.max_grad_norm) + optimizer.step_param(tensor, param_group) + tensor.grad = None + + parameter.register_post_accumulate_grad_hook(__grad_hook) + # TextEncoderの出力をキャッシュするときにはCPUへ移動する if args.cache_text_encoder_outputs: # move Text Encoders for sampling images. 
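Taking the fused backward pass out of the diff for a moment: it hinges on Tensor.register_post_accumulate_grad_hook (available since PyTorch 2.1), which fires once a parameter's gradient has finished accumulating, so each parameter can be stepped and its gradient freed before the rest of backward completes. A sketch with a plain SGD-style update standing in for the patched optimizer.step_param (toy model, invented learning rate):

import torch

model = torch.nn.Linear(8, 8)

def step_and_release(p: torch.Tensor) -> None:
    torch.nn.utils.clip_grad_norm_([p], max_norm=1.0)  # per-parameter clip
    with torch.no_grad():
        p.add_(p.grad, alpha=-1e-2)  # stand-in for optimizer.step_param(p, group)
    p.grad = None  # free the gradient immediately to cap peak VRAM

for p in model.parameters():
    if p.requires_grad:
        p.register_post_accumulate_grad_hook(step_and_release)

loss = model(torch.randn(4, 8)).sum()
loss.backward()  # hooks fire per parameter during backward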
Text Encoder doesn't work on CPU with fp16 @@ -619,13 +633,16 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c) accelerator.backward(loss) - if accelerator.sync_gradients and args.max_grad_norm != 0.0: - params_to_clip = [] - for m in training_models: - params_to_clip.extend(m.parameters()) - accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) - optimizer.step() + if not args.fused_backward_pass: + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + params_to_clip = [] + for m in training_models: + params_to_clip.extend(m.parameters()) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() optimizer.zero_grad(set_to_none=True) From 64916a35b2378c4a8cdf3e9efeef8b8ab7ccb41c Mon Sep 17 00:00:00 2001 From: Zovjsra <4703michael@gmail.com> Date: Tue, 16 Apr 2024 16:40:08 +0800 Subject: [PATCH 082/132] add disable_mmap to args --- library/sdxl_model_util.py | 14 +++++++++----- library/sdxl_train_util.py | 9 +++++++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/library/sdxl_model_util.py b/library/sdxl_model_util.py index f03f1bae5..e6fcb1f9c 100644 --- a/library/sdxl_model_util.py +++ b/library/sdxl_model_util.py @@ -1,4 +1,5 @@ import torch +import safetensors from accelerate import init_empty_weights from accelerate.utils.modeling import set_module_tensor_to_device from safetensors.torch import load_file, save_file @@ -163,17 +164,20 @@ def _load_state_dict_on_device(model, state_dict, device, dtype=None): raise RuntimeError("Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))) -def load_models_from_sdxl_checkpoint(model_version, ckpt_path, map_location, dtype=None): +def load_models_from_sdxl_checkpoint(model_version, ckpt_path, map_location, dtype=None, disable_mmap=False): # model_version is reserved for future use # dtype is used for full_fp16/bf16 integration. 
Text Encoder will remain fp32, because it runs on CPU when caching # Load the state dict if model_util.is_safetensors(ckpt_path): checkpoint = None - try: - state_dict = load_file(ckpt_path, device=map_location) - except: - state_dict = load_file(ckpt_path) # prevent device invalid Error + if(disable_mmap): + state_dict = safetensors.torch.load(open(ckpt_path, 'rb').read()) + else: + try: + state_dict = load_file(ckpt_path, device=map_location) + except: + state_dict = load_file(ckpt_path) # prevent device invalid Error epoch = None global_step = None else: diff --git a/library/sdxl_train_util.py b/library/sdxl_train_util.py index a29013e34..106c5b455 100644 --- a/library/sdxl_train_util.py +++ b/library/sdxl_train_util.py @@ -44,6 +44,7 @@ def load_target_model(args, accelerator, model_version: str, weight_dtype): weight_dtype, accelerator.device if args.lowram else "cpu", model_dtype, + args.disable_mmap_load_safetensors ) # work on low-ram device @@ -60,7 +61,7 @@ def load_target_model(args, accelerator, model_version: str, weight_dtype): def _load_target_model( - name_or_path: str, vae_path: Optional[str], model_version: str, weight_dtype, device="cpu", model_dtype=None + name_or_path: str, vae_path: Optional[str], model_version: str, weight_dtype, device="cpu", model_dtype=None, disable_mmap=False ): # model_dtype only work with full fp16/bf16 name_or_path = os.readlink(name_or_path) if os.path.islink(name_or_path) else name_or_path @@ -75,7 +76,7 @@ def _load_target_model( unet, logit_scale, ckpt_info, - ) = sdxl_model_util.load_models_from_sdxl_checkpoint(model_version, name_or_path, device, model_dtype) + ) = sdxl_model_util.load_models_from_sdxl_checkpoint(model_version, name_or_path, device, model_dtype, disable_mmap) else: # Diffusers model is loaded to CPU from diffusers import StableDiffusionXLPipeline @@ -332,6 +333,10 @@ def add_sdxl_training_arguments(parser: argparse.ArgumentParser): action="store_true", help="cache text encoder outputs to disk / text encoderの出力をディスクにキャッシュする", ) + parser.add_argument( + "--disable_mmap_load_safetensors", + action="store_true", + ) def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCaching: bool = True): From feefcf256e78a5f8d60c3a940f2be3b5c3ca335d Mon Sep 17 00:00:00 2001 From: Cauldrath Date: Thu, 18 Apr 2024 23:15:36 -0400 Subject: [PATCH 083/132] Display name of error latent file When trying to load stored latents, if an error occurs, this change will tell you what file failed to load Currently it will just tell you that something failed without telling you which file --- library/train_util.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 15c23f3cc..58527fa00 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2123,18 +2123,21 @@ def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool): if not os.path.exists(npz_path): return False - npz = np.load(npz_path) - if "latents" not in npz or "original_size" not in npz or "crop_ltrb" not in npz: # old ver? - return False - if npz["latents"].shape[1:3] != expected_latents_size: - return False - - if flip_aug: - if "latents_flipped" not in npz: + try: + npz = np.load(npz_path) + if "latents" not in npz or "original_size" not in npz or "crop_ltrb" not in npz: # old ver? 
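The shape of the check being wrapped here, as a standalone validate-or-regenerate helper. This sketch folds in the soft-fail behavior a later commit in this series settles on; the path handling and expected size are illustrative, and the real code also validates original_size, crop_ltrb, and the flipped latents:

import numpy as np

def cached_latents_ok(npz_path: str, expected_hw=(64, 64)) -> bool:
    try:
        npz = np.load(npz_path)
        if "latents" not in npz:  # old cache format
            return False
        return tuple(npz["latents"].shape[1:3]) == tuple(expected_hw)
    except Exception as e:
        print(npz_path)  # name the offending file ...
        print(e)
        return False     # ... and let the caller regenerate it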
return False - if npz["latents_flipped"].shape[1:3] != expected_latents_size: + if npz["latents"].shape[1:3] != expected_latents_size: return False + if flip_aug: + if "latents_flipped" not in npz: + return False + if npz["latents_flipped"].shape[1:3] != expected_latents_size: + return False + except: + raise RuntimeError(f"Error loading file: {npz_path}") + return True From fc374375de4fc9efd10eb598fdc166a4b6d0ad17 Mon Sep 17 00:00:00 2001 From: Cauldrath Date: Thu, 18 Apr 2024 23:29:01 -0400 Subject: [PATCH 084/132] Allow negative learning rate This can be used to train away from a group of images you don't want As this moves the model away from a point instead of towards it, the change in the model is unbounded So, don't set it too low. -4e-7 seemed to work well. --- sdxl_train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdxl_train.py b/sdxl_train.py index 46d7860be..1e6cec1a4 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -272,7 +272,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # 学習を準備する:モデルを適切な状態にする if args.gradient_checkpointing: unet.enable_gradient_checkpointing() - train_unet = args.learning_rate > 0 + train_unet = args.learning_rate != 0 train_text_encoder1 = False train_text_encoder2 = False @@ -284,8 +284,8 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder2.gradient_checkpointing_enable() lr_te1 = args.learning_rate_te1 if args.learning_rate_te1 is not None else args.learning_rate # 0 means not train lr_te2 = args.learning_rate_te2 if args.learning_rate_te2 is not None else args.learning_rate # 0 means not train - train_text_encoder1 = lr_te1 > 0 - train_text_encoder2 = lr_te2 > 0 + train_text_encoder1 = lr_te1 != 0 + train_text_encoder2 = lr_te2 != 0 # caching one text encoder output is not supported if not train_text_encoder1: From 2c9db5d9f2f6b57f15b9312139d0410ae8ae4f3c Mon Sep 17 00:00:00 2001 From: Maatra Date: Sat, 20 Apr 2024 14:11:43 +0100 Subject: [PATCH 085/132] passing filtered hyperparameters to accelerate --- fine_tune.py | 2 +- library/train_util.py | 14 ++++++++++++++ sdxl_train.py | 2 +- sdxl_train_control_net_lllite.py | 2 +- sdxl_train_control_net_lllite_old.py | 2 +- train_controlnet.py | 2 +- train_db.py | 2 +- train_network.py | 2 +- train_textual_inversion.py | 2 +- train_textual_inversion_XTI.py | 2 +- 10 files changed, 23 insertions(+), 9 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index c7e6bbd2e..77a1a4f30 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -310,7 +310,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): init_kwargs["wandb"] = {"name": args.wandb_run_name} if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) - accelerator.init_trackers("finetuning" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs) + accelerator.init_trackers("finetuning" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs) # For --sample_at_first train_util.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) diff --git a/library/train_util.py b/library/train_util.py index 15c23f3cc..40be2b05b 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3378,6 +3378,20 @@ def add_masked_loss_arguments(parser: argparse.ArgumentParser): help="apply mask for calculating loss. conditioning_data_dir is required for dataset. 
/ 損失計算時にマスクを適用する。datasetにはconditioning_data_dirが必要", ) +def filter_sensitive_args(args: argparse.Namespace): + sensitive_args = ["wandb_api_key", "huggingface_token"] + sensitive_path_args = [ + "pretrained_model_name_or_path", + "vae", + "tokenizer_cache_dir", + "train_data_dir", + "conditioning_data_dir", + "reg_data_dir", + "output_dir", + "logging_dir", + ] + filtered_args = {k: v for k, v in vars(args).items() if k not in sensitive_args + sensitive_path_args} + return filtered_args # verify command line args for training def verify_command_line_training_args(args: argparse.Namespace): diff --git a/sdxl_train.py b/sdxl_train.py index 46d7860be..5a9aa214e 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -487,7 +487,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): init_kwargs["wandb"] = {"name": args.wandb_run_name} if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) - accelerator.init_trackers("finetuning" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs) + accelerator.init_trackers("finetuning" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs) # For --sample_at_first sdxl_train_util.sample_images( diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index f89c3628f..770a1f3df 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -353,7 +353,7 @@ def train(args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "lllite_control_net_train" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs + "lllite_control_net_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs ) loss_recorder = train_util.LossRecorder() diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py index e85e978c1..9490cf6f2 100644 --- a/sdxl_train_control_net_lllite_old.py +++ b/sdxl_train_control_net_lllite_old.py @@ -324,7 +324,7 @@ def train(args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "lllite_control_net_train" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs + "lllite_control_net_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs ) loss_recorder = train_util.LossRecorder() diff --git a/train_controlnet.py b/train_controlnet.py index f4c94e8d9..793f79c7d 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -344,7 +344,7 @@ def train(args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "controlnet_train" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs + "controlnet_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs ) loss_recorder = train_util.LossRecorder() diff --git a/train_db.py b/train_db.py index 1de504ed8..4f9018293 100644 --- a/train_db.py +++ b/train_db.py @@ -290,7 +290,7 @@ def train(args): init_kwargs["wandb"] = {"name": args.wandb_run_name} if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) - 
accelerator.init_trackers("dreambooth" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs) + accelerator.init_trackers("dreambooth" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs) # For --sample_at_first train_util.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) diff --git a/train_network.py b/train_network.py index c99d37247..1dca437cf 100644 --- a/train_network.py +++ b/train_network.py @@ -753,7 +753,7 @@ def load_model_hook(models, input_dir): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "network_train" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs + "network_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs ) loss_recorder = train_util.LossRecorder() diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 10fce2677..56a387391 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -510,7 +510,7 @@ def train(self, args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs + "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs ) # function for saving/removing diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index ddd03d532..691785239 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -407,7 +407,7 @@ def train(args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs + "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs ) # function for saving/removing From 4477116a64bb6c363d0fd9fbf3e21bb813548dfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?= Date: Sat, 20 Apr 2024 21:26:09 +0800 Subject: [PATCH 086/132] fix train controlnet --- library/train_util.py | 4 ++-- requirements.txt | 1 + train_controlnet.py | 8 ++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 15c23f3cc..ecf3345fb 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1982,8 +1982,8 @@ def make_buckets(self): self.bucket_manager = self.dreambooth_dataset_delegate.bucket_manager self.buckets_indices = self.dreambooth_dataset_delegate.buckets_indices - def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True): - return self.dreambooth_dataset_delegate.cache_latents(vae, vae_batch_size, cache_to_disk, is_main_process) + def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True, cache_file_suffix=".npz", divisor=8): + return self.dreambooth_dataset_delegate.cache_latents(vae, vae_batch_size, cache_to_disk, is_main_process, cache_file_suffix, divisor) def __len__(self): return self.dreambooth_dataset_delegate.__len__() diff --git 
a/requirements.txt b/requirements.txt index e99775b8a..9495dab2a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,6 +17,7 @@ easygui==0.98.3 toml==0.10.2 voluptuous==0.13.1 huggingface-hub==0.20.1 +omegaconf==2.3.0 # for Image utils imagesize==1.4.1 # for BLIP captioning diff --git a/train_controlnet.py b/train_controlnet.py index f4c94e8d9..763041aa6 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -5,7 +5,7 @@ import random import time from multiprocessing import Value -from types import SimpleNamespace +from omegaconf import OmegaConf import toml from tqdm import tqdm @@ -148,8 +148,10 @@ def train(args): "in_channels": 4, "layers_per_block": 2, "mid_block_scale_factor": 1, + "mid_block_type": "UNetMidBlock2DCrossAttn", "norm_eps": 1e-05, "norm_num_groups": 32, + "num_attention_heads": [5, 10, 20, 20], "num_class_embeds": None, "only_cross_attention": False, "out_channels": 4, @@ -179,8 +181,10 @@ def train(args): "in_channels": 4, "layers_per_block": 2, "mid_block_scale_factor": 1, + "mid_block_type": "UNetMidBlock2DCrossAttn", "norm_eps": 1e-05, "norm_num_groups": 32, + "num_attention_heads": 8, "out_channels": 4, "sample_size": 64, "up_block_types": ["UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"], @@ -193,7 +197,7 @@ def train(args): "resnet_time_scale_shift": "default", "projection_class_embeddings_input_dim": None, } - unet.config = SimpleNamespace(**unet.config) + unet.config = OmegaConf.create(unet.config) controlnet = ControlNetModel.from_unet(unet) From b886d0a359526f5715f3ced05697d406a169055b Mon Sep 17 00:00:00 2001 From: Maatra Date: Sat, 20 Apr 2024 14:36:47 +0100 Subject: [PATCH 087/132] Cleaned typing to be in line with accelerate hyperparameters type resctrictions --- library/train_util.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index 40be2b05b..75b3420d9 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3390,7 +3390,20 @@ def filter_sensitive_args(args: argparse.Namespace): "output_dir", "logging_dir", ] - filtered_args = {k: v for k, v in vars(args).items() if k not in sensitive_args + sensitive_path_args} + filtered_args = {} + for k, v in vars(args).items(): + # filter out sensitive values + if k not in sensitive_args + sensitive_path_args: + #Accelerate values need to have type `bool`,`str`, `float`, `int`, or `None`. 
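The coercion in isolation: a short sketch of flattening an argparse.Namespace into the scalar types accelerate trackers accept (the sample Namespace is invented):

import argparse

args = argparse.Namespace(lr=1e-4, tags=["a", "b"], resume=None, opts={"k": 1})
safe = {}
for k, v in vars(args).items():
    if v is None or isinstance(v, (bool, str, float, int)):
        safe[k] = v       # natively supported by accelerate
    else:
        safe[k] = f"{v}"  # lists and other objects are stringified
print(safe)  # {'lr': 0.0001, 'tags': "['a', 'b']", 'resume': None, 'opts': "{'k': 1}"}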
+ if v is None or isinstance(v, bool) or isinstance(v, str) or isinstance(v, float) or isinstance(v, int): + filtered_args[k] = v + # accelerate does not support lists + elif isinstance(v, list): + filtered_args[k] = f"{v}" + # accelerate does not support objects + elif isinstance(v, object): + filtered_args[k] = f"{v}" + return filtered_args # verify command line args for training From 5cb145d13bd9fae307a8766f4088b95f01492580 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?= Date: Sat, 20 Apr 2024 21:56:24 +0800 Subject: [PATCH 088/132] Update train_util.py --- library/train_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index ecf3345fb..15c23f3cc 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1982,8 +1982,8 @@ def make_buckets(self): self.bucket_manager = self.dreambooth_dataset_delegate.bucket_manager self.buckets_indices = self.dreambooth_dataset_delegate.buckets_indices - def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True, cache_file_suffix=".npz", divisor=8): - return self.dreambooth_dataset_delegate.cache_latents(vae, vae_batch_size, cache_to_disk, is_main_process, cache_file_suffix, divisor) + def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_process=True): + return self.dreambooth_dataset_delegate.cache_latents(vae, vae_batch_size, cache_to_disk, is_main_process) def __len__(self): return self.dreambooth_dataset_delegate.__len__() From 52652cba1a419cd72851c3882f1f877670d889c5 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 21 Apr 2024 17:41:32 +0900 Subject: [PATCH 089/132] disable main process check for deepspeed #1247 --- train_network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index c99d37247..3a5255160 100644 --- a/train_network.py +++ b/train_network.py @@ -474,7 +474,8 @@ def train(self, args): # before resuming make hook for saving/loading to save/load the network weights only def save_model_hook(models, weights, output_dir): # pop weights of other models than network to save only network weights - if accelerator.is_main_process: + # only main process or deepspeed https://github.com/huggingface/diffusers/issues/2606 + if accelerator.is_main_process or args.deepspeed: remove_indices = [] for i, model in enumerate(models): if not isinstance(model, type(accelerator.unwrap_model(network))): From 0540c33acac223b672da05e40edcfb3b6a35c0da Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 21 Apr 2024 17:45:29 +0900 Subject: [PATCH 090/132] pop weights if available #1247 --- train_network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 3a5255160..aad5a7194 100644 --- a/train_network.py +++ b/train_network.py @@ -481,7 +481,8 @@ def save_model_hook(models, weights, output_dir): if not isinstance(model, type(accelerator.unwrap_model(network))): remove_indices.append(i) for i in reversed(remove_indices): - weights.pop(i) + if len(weights) > i: + weights.pop(i) # print(f"save model hook: {len(weights)} weights will be saved") def load_model_hook(models, input_dir): From 040e26ff1d8f855f52cdfb62781e06284c5e9e34 Mon Sep 17 00:00:00 2001 From: Cauldrath Date: Sun, 21 Apr 2024 13:46:31 -0400 Subject: [PATCH 091/132] Regenerate failed file If a latent file fails to load, print out the path and the error, then return false to regenerate it --- library/train_util.py | 
6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 58527fa00..4168a41fb 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2135,8 +2135,10 @@ def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool): return False if npz["latents_flipped"].shape[1:3] != expected_latents_size: return False - except: - raise RuntimeError(f"Error loading file: {npz_path}") + except Exception as e: + print(npz_path) + print(e) + return False return True From fdbb03c360777562e91ab1884ed7cf2c3d65611b Mon Sep 17 00:00:00 2001 From: frodo821 Date: Tue, 23 Apr 2024 14:29:05 +0900 Subject: [PATCH 092/132] removed unnecessary `torch` import on line 115 as per #1290 --- finetune/tag_images_by_wd14_tagger.py | 1 - 1 file changed, 1 deletion(-) diff --git a/finetune/tag_images_by_wd14_tagger.py b/finetune/tag_images_by_wd14_tagger.py index a327bbd61..b3f9cdd26 100644 --- a/finetune/tag_images_by_wd14_tagger.py +++ b/finetune/tag_images_by_wd14_tagger.py @@ -112,7 +112,6 @@ def main(args): # モデルを読み込む if args.onnx: - import torch import onnx import onnxruntime as ort From 969f82ab474024865d292afd96768e817c9374c1 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 29 Apr 2024 20:04:25 +0900 Subject: [PATCH 093/132] move loraplus args from args to network_args, simplify log lr desc --- library/train_util.py | 3 -- networks/lora.py | 58 ++++++++++++++------- train_network.py | 114 ++++++++++++++++-------------------------- 3 files changed, 84 insertions(+), 91 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 048ed2ce3..15c23f3cc 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2920,9 +2920,6 @@ def add_optimizer_arguments(parser: argparse.ArgumentParser): default=1, help="Polynomial power for polynomial scheduler / polynomialスケジューラでのpolynomial power", ) - parser.add_argument("--loraplus_lr_ratio", default=None, type=float, help="LoRA+ learning rate ratio") - parser.add_argument("--loraplus_unet_lr_ratio", default=None, type=float, help="LoRA+ UNet learning rate ratio") - parser.add_argument("--loraplus_text_encoder_lr_ratio", default=None, type=float, help="LoRA+ text encoder learning rate ratio") def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: bool): diff --git a/networks/lora.py b/networks/lora.py index edbbdc0d8..b67c59bd5 100644 --- a/networks/lora.py +++ b/networks/lora.py @@ -490,6 +490,14 @@ def create_network( varbose=True, ) + loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None) + loraplus_unet_lr_ratio = kwargs.get("loraplus_unet_lr_ratio", None) + loraplus_text_encoder_lr_ratio = kwargs.get("loraplus_text_encoder_lr_ratio", None) + loraplus_lr_ratio = float(loraplus_lr_ratio) if loraplus_lr_ratio is not None else None + loraplus_unet_lr_ratio = float(loraplus_unet_lr_ratio) if loraplus_unet_lr_ratio is not None else None + loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None + network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio) + if up_lr_weight is not None or mid_lr_weight is not None or down_lr_weight is not None: network.set_block_lr_weight(up_lr_weight, mid_lr_weight, down_lr_weight) @@ -1033,18 +1041,27 @@ def get_lr_weight(self, lora: LoRAModule) -> float: return lr_weight + def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio): + 
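# Stored here and consulted later by prepare_optimizer_params(); the
# component-specific ratios win over the blanket ratio via `specific or blanket`.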
self.loraplus_lr_ratio = loraplus_lr_ratio + self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio + self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio + # 二つのText Encoderに別々の学習率を設定できるようにするといいかも - def prepare_optimizer_params( - self, - text_encoder_lr, - unet_lr, - default_lr, - text_encoder_loraplus_ratio=None, - unet_loraplus_ratio=None, - loraplus_ratio=None - ): + def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr): + # TODO warn if optimizer is not compatible with LoRA+ (but it will cause error so we don't need to check it here?) + # if ( + # self.loraplus_lr_ratio is not None + # or self.loraplus_text_encoder_lr_ratio is not None + # or self.loraplus_unet_lr_ratio is not None + # ): + # assert ( + # optimizer_type.lower() != "prodigy" and "dadapt" not in optimizer_type.lower() + # ), "LoRA+ and Prodigy/DAdaptation is not supported / LoRA+とProdigy/DAdaptationの組み合わせはサポートされていません" + self.requires_grad_(True) + all_params = [] + lr_descriptions = [] def assemble_params(loras, lr, ratio): param_groups = {"lora": {}, "plus": {}} @@ -1056,6 +1073,7 @@ def assemble_params(loras, lr, ratio): param_groups["lora"][f"{lora.lora_name}.{name}"] = param params = [] + descriptions = [] for key in param_groups.keys(): param_data = {"params": param_groups[key].values()} @@ -1069,20 +1087,22 @@ def assemble_params(loras, lr, ratio): param_data["lr"] = lr if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: - print("NO LR skipping!") + logger.info("NO LR skipping!") continue params.append(param_data) + descriptions.append("plus" if key == "plus" else "") - return params + return params, descriptions if self.text_encoder_loras: - params = assemble_params( + params, descriptions = assemble_params( self.text_encoder_loras, text_encoder_lr if text_encoder_lr is not None else default_lr, - text_encoder_loraplus_ratio or loraplus_ratio + self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio, ) all_params.extend(params) + lr_descriptions.extend(["textencoder" + (" " + d if d else "") for d in descriptions]) if self.unet_loras: if self.block_lr: @@ -1096,22 +1116,24 @@ def assemble_params(loras, lr, ratio): # blockごとにパラメータを設定する for idx, block_loras in block_idx_to_lora.items(): - params = assemble_params( + params, descriptions = assemble_params( block_loras, (unet_lr if unet_lr is not None else default_lr) * self.get_lr_weight(block_loras[0]), - unet_loraplus_ratio or loraplus_ratio + self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio, ) all_params.extend(params) + lr_descriptions.extend([f"unet_block{idx}" + (" " + d if d else "") for d in descriptions]) else: - params = assemble_params( + params, descriptions = assemble_params( self.unet_loras, unet_lr if unet_lr is not None else default_lr, - unet_loraplus_ratio or loraplus_ratio + self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio, ) all_params.extend(params) + lr_descriptions.extend(["unet" + (" " + d if d else "") for d in descriptions]) - return all_params + return all_params, lr_descriptions def enable_gradient_checkpointing(self): # not supported diff --git a/train_network.py b/train_network.py index 9670490ae..c43241e8d 100644 --- a/train_network.py +++ b/train_network.py @@ -53,7 +53,15 @@ def __init__(self): # TODO 他のスクリプトと共通化する def generate_step_logs( - self, args: argparse.Namespace, current_loss, avr_loss, lr_scheduler, keys_scaled=None, mean_norm=None, maximum_norm=None + self, + args: argparse.Namespace, + current_loss, + avr_loss, + lr_scheduler, + lr_descriptions, + 
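# lr_descriptions: one human-readable label per optimizer param_group,
# aligned index-for-index with lr_scheduler.get_last_lr() below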
keys_scaled=None, + mean_norm=None, + maximum_norm=None, ): logs = {"loss/current": current_loss, "loss/average": avr_loss} @@ -63,68 +71,25 @@ def generate_step_logs( logs["max_norm/max_key_norm"] = maximum_norm lrs = lr_scheduler.get_last_lr() - - if len(lrs) > 4: - idx = 0 - if not args.network_train_unet_only: - logs["lr/textencoder"] = float(lrs[0]) - idx = 1 - - for i in range(idx, len(lrs)): - lora_plus = "" - group_id = i - - if args.loraplus_lr_ratio is not None or args.loraplus_unet_lr_ratio is not None: - lora_plus = '_lora+' if i % 2 == 1 else '' - group_id = int((i / 2) + (i % 2 + 0.5)) - - logs[f"lr/group{group_id}{lora_plus}"] = float(lrs[i]) - if args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower(): - logs[f"lr/d*lr/group{group_id}{lora_plus}"] = ( - lr_scheduler.optimizers[-1].param_groups[i]["d"] * lr_scheduler.optimizers[-1].param_groups[i]["lr"] - ) - - else: - if args.network_train_text_encoder_only: - if args.loraplus_lr_ratio is not None or args.loraplus_text_encoder_lr_ratio is not None: - logs["lr/textencoder"] = float(lrs[0]) - logs["lr/textencoder_lora+"] = float(lrs[1]) - else: - logs["lr/textencoder"] = float(lrs[0]) - - elif args.network_train_unet_only: - if args.loraplus_lr_ratio is not None or args.loraplus_unet_lr_ratio is not None: - logs["lr/unet"] = float(lrs[0]) - logs["lr/unet_lora+"] = float(lrs[1]) - else: - logs["lr/unet"] = float(lrs[0]) + for i, lr in enumerate(lrs): + if lr_descriptions is not None: + lr_desc = lr_descriptions[i] else: - if len(lrs) == 2: - if args.loraplus_text_encoder_lr_ratio is not None and args.loraplus_unet_lr_ratio is None: - logs["lr/textencoder"] = float(lrs[0]) - logs["lr/textencoder_lora+"] = float(lrs[1]) - elif args.loraplus_unet_lr_ratio is not None and args.loraplus_text_encoder_lr_ratio is None: - logs["lr/unet"] = float(lrs[0]) - logs["lr/unet_lora+"] = float(lrs[1]) - elif args.loraplus_unet_lr_ratio is None and args.loraplus_text_encoder_lr_ratio is None and args.loraplus_lr_ratio is not None: - logs["lr/all"] = float(lrs[0]) - logs["lr/all_lora+"] = float(lrs[1]) - else: - logs["lr/textencoder"] = float(lrs[0]) - logs["lr/unet"] = float(lrs[-1]) - elif len(lrs) == 4: - logs["lr/textencoder"] = float(lrs[0]) - logs["lr/textencoder_lora+"] = float(lrs[1]) - logs["lr/unet"] = float(lrs[2]) - logs["lr/unet_lora+"] = float(lrs[3]) + idx = i - (0 if args.network_train_unet_only else -1) + if idx == -1: + lr_desc = "textencoder" else: - logs["lr/all"] = float(lrs[0]) + if len(lrs) > 2: + lr_desc = f"group{idx}" + else: + lr_desc = "unet" + + logs[f"lr/{lr_desc}"] = lr - if ( - args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower() - ): # tracking d*lr value of unet. 
- logs["lr/d*lr"] = ( - lr_scheduler.optimizers[-1].param_groups[0]["d"] * lr_scheduler.optimizers[-1].param_groups[0]["lr"] + if args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower(): + # tracking d*lr value + logs[f"lr/d*lr/{lr_desc}"] = ( + lr_scheduler.optimizers[-1].param_groups[i]["d"] * lr_scheduler.optimizers[-1].param_groups[i]["lr"] ) return logs @@ -358,6 +323,7 @@ def train(self, args): network.apply_to(text_encoder, unet, train_text_encoder, train_unet) if args.network_weights is not None: + # FIXME consider alpha of weights info = network.load_weights(args.network_weights) accelerator.print(f"load network weights from {args.network_weights}: {info}") @@ -373,20 +339,23 @@ def train(self, args): # 後方互換性を確保するよ try: - trainable_params = network.prepare_optimizer_params(args.text_encoder_lr, args.unet_lr, args.learning_rate, args.loraplus_text_encoder_lr_ratio, args.loraplus_unet_lr_ratio, args.loraplus_lr_ratio) + results = network.prepare_optimizer_params(args.text_encoder_lr, args.unet_lr, args.learning_rate) + if type(results) is tuple: + trainable_params = results[0] + lr_descriptions = results[1] + else: + trainable_params = results + lr_descriptions = None except TypeError: - accelerator.print( - "Deprecated: use prepare_optimizer_params(text_encoder_lr, unet_lr, learning_rate) instead of prepare_optimizer_params(text_encoder_lr, unet_lr)" - ) + # accelerator.print( + # "Deprecated: use prepare_optimizer_params(text_encoder_lr, unet_lr, learning_rate) instead of prepare_optimizer_params(text_encoder_lr, unet_lr)" + # ) trainable_params = network.prepare_optimizer_params(args.text_encoder_lr, args.unet_lr) + lr_descriptions = None + print(lr_descriptions) optimizer_name, optimizer_args, optimizer = train_util.get_optimizer(args, trainable_params) - if args.loraplus_lr_ratio is not None or args.loraplus_text_encoder_lr_ratio is not None or args.loraplus_unet_lr_ratio is not None: - assert ( - (optimizer_name != "Prodigy" and "DAdapt" not in optimizer_name) - ), "LoRA+ and Prodigy/DAdaptation is not supported" - # dataloaderを準備する # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers @@ -992,7 +961,9 @@ def remove_model(old_ckpt_name): progress_bar.set_postfix(**{**max_mean_logs, **logs}) if args.logging_dir is not None: - logs = self.generate_step_logs(args, current_loss, avr_loss, lr_scheduler, keys_scaled, mean_norm, maximum_norm) + logs = self.generate_step_logs( + args, current_loss, avr_loss, lr_scheduler, lr_descriptions, keys_scaled, mean_norm, maximum_norm + ) accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: @@ -1143,6 +1114,9 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", ) + # parser.add_argument("--loraplus_lr_ratio", default=None, type=float, help="LoRA+ learning rate ratio") + # parser.add_argument("--loraplus_unet_lr_ratio", default=None, type=float, help="LoRA+ UNet learning rate ratio") + # parser.add_argument("--loraplus_text_encoder_lr_ratio", default=None, type=float, help="LoRA+ text encoder learning rate ratio") return parser From dbb7bb288e416dae56d2911077e2642ad0f4b20d Mon Sep 17 00:00:00 2001 From: Dave Lage Date: Thu, 2 May 2024 17:39:35 -0400 Subject: [PATCH 094/132] Fix caption_separator missing in subset schema --- 
library/config_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/library/config_util.py b/library/config_util.py index d75d03b03..0276acb1e 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -191,6 +191,7 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] "keep_tokens": int, "keep_tokens_separator": str, "secondary_separator": str, + "caption_separator": str, "enable_wildcard": bool, "token_warmup_min": int, "token_warmup_step": Any(float, int), From 8db0cadcee47005feef5be34cbfaac8b85fe8837 Mon Sep 17 00:00:00 2001 From: Dave Lage Date: Thu, 2 May 2024 18:08:28 -0400 Subject: [PATCH 095/132] Add caption_separator to output for subset --- library/config_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/library/config_util.py b/library/config_util.py index d75d03b03..97554bbef 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -523,6 +523,7 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu shuffle_caption: {subset.shuffle_caption} keep_tokens: {subset.keep_tokens} keep_tokens_separator: {subset.keep_tokens_separator} + caption_separator: {subset.caption_separator} secondary_separator: {subset.secondary_separator} enable_wildcard: {subset.enable_wildcard} caption_dropout_rate: {subset.caption_dropout_rate} From 58c2d856ae6da6d6962cbfdd98c8a93eb790cbde Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 3 May 2024 22:18:20 +0900 Subject: [PATCH 096/132] support block dim/lr for sdxl --- networks/lora.py | 275 +++++++++++++++++++++++++++-------------------- train_network.py | 4 +- 2 files changed, 158 insertions(+), 121 deletions(-) diff --git a/networks/lora.py b/networks/lora.py index b67c59bd5..61b8cd5a7 100644 --- a/networks/lora.py +++ b/networks/lora.py @@ -12,6 +12,7 @@ import torch import re from library.utils import setup_logging +from library.sdxl_original_unet import SdxlUNet2DConditionModel setup_logging() import logging @@ -385,14 +386,14 @@ def to_out_forward(self, x): return out -def parse_block_lr_kwargs(nw_kwargs): +def parse_block_lr_kwargs(is_sdxl: bool, nw_kwargs: Dict) -> Optional[List[float]]: down_lr_weight = nw_kwargs.get("down_lr_weight", None) mid_lr_weight = nw_kwargs.get("mid_lr_weight", None) up_lr_weight = nw_kwargs.get("up_lr_weight", None) # 以上のいずれにも設定がない場合は無効としてNoneを返す if down_lr_weight is None and mid_lr_weight is None and up_lr_weight is None: - return None, None, None + return None # extract learning rate weight for each block if down_lr_weight is not None: @@ -401,18 +402,16 @@ def parse_block_lr_kwargs(nw_kwargs): down_lr_weight = [(float(s) if s else 0.0) for s in down_lr_weight.split(",")] if mid_lr_weight is not None: - mid_lr_weight = float(mid_lr_weight) + mid_lr_weight = [(float(s) if s else 0.0) for s in mid_lr_weight.split(",")] if up_lr_weight is not None: if "," in up_lr_weight: up_lr_weight = [(float(s) if s else 0.0) for s in up_lr_weight.split(",")] - down_lr_weight, mid_lr_weight, up_lr_weight = get_block_lr_weight( - down_lr_weight, mid_lr_weight, up_lr_weight, float(nw_kwargs.get("block_lr_zero_threshold", 0.0)) + return get_block_lr_weight( + is_sdxl, down_lr_weight, mid_lr_weight, up_lr_weight, float(nw_kwargs.get("block_lr_zero_threshold", 0.0)) ) - return down_lr_weight, mid_lr_weight, up_lr_weight - def create_network( multiplier: float, @@ -424,6 +423,9 @@ def create_network( neuron_dropout: Optional[float] = None, **kwargs, ): + # if unet is an instance of SdxlUNet2DConditionModel or subclass, set is_sdxl to True + 
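The detection idiom used in the create_network change below, with toy classes standing in for the real ones:

class SdxlUNet2DConditionModel: ...
class WrappedUNet(SdxlUNet2DConditionModel): ...

unet = WrappedUNet()
is_sdxl = unet is not None and issubclass(unet.__class__, SdxlUNet2DConditionModel)
assert is_sdxl  # equivalent to isinstance(unet, SdxlUNet2DConditionModel)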
is_sdxl = unet is not None and issubclass(unet.__class__, SdxlUNet2DConditionModel) + if network_dim is None: network_dim = 4 # default if network_alpha is None: @@ -441,21 +443,21 @@ def create_network( # block dim/alpha/lr block_dims = kwargs.get("block_dims", None) - down_lr_weight, mid_lr_weight, up_lr_weight = parse_block_lr_kwargs(kwargs) + block_lr_weight = parse_block_lr_kwargs(is_sdxl, kwargs) # 以上のいずれかに指定があればblockごとのdim(rank)を有効にする - if block_dims is not None or down_lr_weight is not None or mid_lr_weight is not None or up_lr_weight is not None: + if block_dims is not None or block_lr_weight is not None: block_alphas = kwargs.get("block_alphas", None) conv_block_dims = kwargs.get("conv_block_dims", None) conv_block_alphas = kwargs.get("conv_block_alphas", None) block_dims, block_alphas, conv_block_dims, conv_block_alphas = get_block_dims_and_alphas( - block_dims, block_alphas, network_dim, network_alpha, conv_block_dims, conv_block_alphas, conv_dim, conv_alpha + is_sdxl, block_dims, block_alphas, network_dim, network_alpha, conv_block_dims, conv_block_alphas, conv_dim, conv_alpha ) # remove block dim/alpha without learning rate block_dims, block_alphas, conv_block_dims, conv_block_alphas = remove_block_dims_and_alphas( - block_dims, block_alphas, conv_block_dims, conv_block_alphas, down_lr_weight, mid_lr_weight, up_lr_weight + is_sdxl, block_dims, block_alphas, conv_block_dims, conv_block_alphas, block_lr_weight ) else: @@ -488,6 +490,7 @@ def create_network( conv_block_dims=conv_block_dims, conv_block_alphas=conv_block_alphas, varbose=True, + is_sdxl=is_sdxl, ) loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None) @@ -498,8 +501,8 @@ def create_network( loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio) - if up_lr_weight is not None or mid_lr_weight is not None or down_lr_weight is not None: - network.set_block_lr_weight(up_lr_weight, mid_lr_weight, down_lr_weight) + if block_lr_weight is not None: + network.set_block_lr_weight(block_lr_weight) return network @@ -509,9 +512,13 @@ def create_network( # block_dims, block_alphas は両方ともNoneまたは両方とも値が入っている # conv_dim, conv_alpha は両方ともNoneまたは両方とも値が入っている def get_block_dims_and_alphas( - block_dims, block_alphas, network_dim, network_alpha, conv_block_dims, conv_block_alphas, conv_dim, conv_alpha + is_sdxl, block_dims, block_alphas, network_dim, network_alpha, conv_block_dims, conv_block_alphas, conv_dim, conv_alpha ): - num_total_blocks = LoRANetwork.NUM_OF_BLOCKS * 2 + 1 + if not is_sdxl: + num_total_blocks = LoRANetwork.NUM_OF_BLOCKS * 2 + LoRANetwork.NUM_OF_MID_BLOCKS + else: + # 1+9+3+9+1=23, no LoRA for emb_layers (0) + num_total_blocks = 1 + LoRANetwork.SDXL_NUM_OF_BLOCKS * 2 + LoRANetwork.SDXL_NUM_OF_MID_BLOCKS + 1 def parse_ints(s): return [int(i) for i in s.split(",")] @@ -522,9 +529,10 @@ def parse_floats(s): # block_dimsとblock_alphasをパースする。必ず値が入る if block_dims is not None: block_dims = parse_ints(block_dims) - assert ( - len(block_dims) == num_total_blocks - ), f"block_dims must have {num_total_blocks} elements / block_dimsは{num_total_blocks}個指定してください" + assert len(block_dims) == num_total_blocks, ( + f"block_dims must have {num_total_blocks} elements but {len(block_dims)} elements are given" + + f" / block_dimsは{num_total_blocks}個指定してください(指定された個数: {len(block_dims)})" + ) else: logger.warning( f"block_dims is not specified. 
all dims are set to {network_dim} / block_dimsが指定されていません。すべてのdimは{network_dim}になります" @@ -575,15 +583,25 @@ def parse_floats(s): return block_dims, block_alphas, conv_block_dims, conv_block_alphas -# 層別学習率用に層ごとの学習率に対する倍率を定義する、外部から呼び出される可能性を考慮しておく +# 層別学習率用に層ごとの学習率に対する倍率を定義する、外部から呼び出せるようにclass外に出しておく +# 戻り値は block ごとの倍率のリスト def get_block_lr_weight( - down_lr_weight, mid_lr_weight, up_lr_weight, zero_threshold -) -> Tuple[List[float], List[float], List[float]]: + is_sdxl, + down_lr_weight: Union[str, List[float]], + mid_lr_weight: List[float], + up_lr_weight: Union[str, List[float]], + zero_threshold: float, +) -> Optional[List[float]]: # パラメータ未指定時は何もせず、今までと同じ動作とする if up_lr_weight is None and mid_lr_weight is None and down_lr_weight is None: - return None, None, None + return None - max_len = LoRANetwork.NUM_OF_BLOCKS # フルモデル相当でのup,downの層の数 + if not is_sdxl: + max_len_for_down_or_up = LoRANetwork.NUM_OF_BLOCKS + max_len_for_mid = LoRANetwork.NUM_OF_MID_BLOCKS + else: + max_len_for_down_or_up = LoRANetwork.SDXL_NUM_OF_BLOCKS + max_len_for_mid = LoRANetwork.SDXL_NUM_OF_MID_BLOCKS def get_list(name_with_suffix) -> List[float]: import math @@ -593,15 +611,18 @@ def get_list(name_with_suffix) -> List[float]: base_lr = float(tokens[1]) if len(tokens) > 1 else 0.0 if name == "cosine": - return [math.sin(math.pi * (i / (max_len - 1)) / 2) + base_lr for i in reversed(range(max_len))] + return [ + math.sin(math.pi * (i / (max_len_for_down_or_up - 1)) / 2) + base_lr + for i in reversed(range(max_len_for_down_or_up)) + ] elif name == "sine": - return [math.sin(math.pi * (i / (max_len - 1)) / 2) + base_lr for i in range(max_len)] + return [math.sin(math.pi * (i / (max_len_for_down_or_up - 1)) / 2) + base_lr for i in range(max_len_for_down_or_up)] elif name == "linear": - return [i / (max_len - 1) + base_lr for i in range(max_len)] + return [i / (max_len_for_down_or_up - 1) + base_lr for i in range(max_len_for_down_or_up)] elif name == "reverse_linear": - return [i / (max_len - 1) + base_lr for i in reversed(range(max_len))] + return [i / (max_len_for_down_or_up - 1) + base_lr for i in reversed(range(max_len_for_down_or_up))] elif name == "zeros": - return [0.0 + base_lr] * max_len + return [0.0 + base_lr] * max_len_for_down_or_up else: logger.error( "Unknown lr_weight argument %s is used. Valid arguments: / 不明なlr_weightの引数 %s が使われました。有効な引数:\n\tcosine, sine, linear, reverse_linear, zeros" @@ -614,20 +635,36 @@ def get_list(name_with_suffix) -> List[float]: if type(up_lr_weight) == str: up_lr_weight = get_list(up_lr_weight) - if (up_lr_weight != None and len(up_lr_weight) > max_len) or (down_lr_weight != None and len(down_lr_weight) > max_len): - logger.warning("down_weight or up_weight is too long. Parameters after %d-th are ignored." % max_len) - logger.warning("down_weightもしくはup_weightが長すぎます。%d個目以降のパラメータは無視されます。" % max_len) - up_lr_weight = up_lr_weight[:max_len] - down_lr_weight = down_lr_weight[:max_len] + if (up_lr_weight != None and len(up_lr_weight) > max_len_for_down_or_up) or ( + down_lr_weight != None and len(down_lr_weight) > max_len_for_down_or_up + ): + logger.warning("down_weight or up_weight is too long. Parameters after %d-th are ignored." % max_len_for_down_or_up) + logger.warning("down_weightもしくはup_weightが長すぎます。%d個目以降のパラメータは無視されます。" % max_len_for_down_or_up) + up_lr_weight = up_lr_weight[:max_len_for_down_or_up] + down_lr_weight = down_lr_weight[:max_len_for_down_or_up] + + if mid_lr_weight != None and len(mid_lr_weight) > max_len_for_mid: + logger.warning("mid_weight is too long. 
Parameters after %d-th are ignored." % max_len_for_mid) + logger.warning("mid_weightが長すぎます。%d個目以降のパラメータは無視されます。" % max_len_for_mid) + mid_lr_weight = mid_lr_weight[:max_len_for_mid] + + if (up_lr_weight != None and len(up_lr_weight) < max_len_for_down_or_up) or ( + down_lr_weight != None and len(down_lr_weight) < max_len_for_down_or_up + ): + logger.warning("down_weight or up_weight is too short. Parameters after %d-th are filled with 1." % max_len_for_down_or_up) + logger.warning( + "down_weightもしくはup_weightが短すぎます。%d個目までの不足したパラメータは1で補われます。" % max_len_for_down_or_up + ) - if (up_lr_weight != None and len(up_lr_weight) < max_len) or (down_lr_weight != None and len(down_lr_weight) < max_len): - logger.warning("down_weight or up_weight is too short. Parameters after %d-th are filled with 1." % max_len) - logger.warning("down_weightもしくはup_weightが短すぎます。%d個目までの不足したパラメータは1で補われます。" % max_len) + if down_lr_weight != None and len(down_lr_weight) < max_len_for_down_or_up: + down_lr_weight = down_lr_weight + [1.0] * (max_len_for_down_or_up - len(down_lr_weight)) + if up_lr_weight != None and len(up_lr_weight) < max_len_for_down_or_up: + up_lr_weight = up_lr_weight + [1.0] * (max_len_for_down_or_up - len(up_lr_weight)) - if down_lr_weight != None and len(down_lr_weight) < max_len: - down_lr_weight = down_lr_weight + [1.0] * (max_len - len(down_lr_weight)) - if up_lr_weight != None and len(up_lr_weight) < max_len: - up_lr_weight = up_lr_weight + [1.0] * (max_len - len(up_lr_weight)) + if mid_lr_weight != None and len(mid_lr_weight) < max_len_for_mid: + logger.warning("mid_weight is too short. Parameters after %d-th are filled with 1." % max_len_for_mid) + logger.warning("mid_weightが短すぎます。%d個目までの不足したパラメータは1で補われます。" % max_len_for_mid) + mid_lr_weight = mid_lr_weight + [1.0] * (max_len_for_mid - len(mid_lr_weight)) if (up_lr_weight != None) or (mid_lr_weight != None) or (down_lr_weight != None): logger.info("apply block learning rate / 階層別学習率を適用します。") @@ -635,72 +672,84 @@ def get_list(name_with_suffix) -> List[float]: down_lr_weight = [w if w > zero_threshold else 0 for w in down_lr_weight] logger.info(f"down_lr_weight (shallower -> deeper, 浅い層->深い層): {down_lr_weight}") else: + down_lr_weight = [1.0] * max_len_for_down_or_up logger.info("down_lr_weight: all 1.0, すべて1.0") if mid_lr_weight != None: - mid_lr_weight = mid_lr_weight if mid_lr_weight > zero_threshold else 0 + mid_lr_weight = [w if w > zero_threshold else 0 for w in mid_lr_weight] logger.info(f"mid_lr_weight: {mid_lr_weight}") else: - logger.info("mid_lr_weight: 1.0") + mid_lr_weight = [1.0] * max_len_for_mid + logger.info("mid_lr_weight: all 1.0, すべて1.0") if up_lr_weight != None: up_lr_weight = [w if w > zero_threshold else 0 for w in up_lr_weight] logger.info(f"up_lr_weight (deeper -> shallower, 深い層->浅い層): {up_lr_weight}") else: + up_lr_weight = [1.0] * max_len_for_down_or_up logger.info("up_lr_weight: all 1.0, すべて1.0") - return down_lr_weight, mid_lr_weight, up_lr_weight + lr_weight = down_lr_weight + mid_lr_weight + up_lr_weight + + if is_sdxl: + lr_weight = [1.0] + lr_weight + [1.0] # add 1.0 for emb_layers and out + + assert (not is_sdxl and len(lr_weight) == LoRANetwork.NUM_OF_BLOCKS * 2 + LoRANetwork.NUM_OF_MID_BLOCKS) or ( + is_sdxl and len(lr_weight) == 1 + LoRANetwork.SDXL_NUM_OF_BLOCKS * 2 + LoRANetwork.SDXL_NUM_OF_MID_BLOCKS + 1 + ), f"lr_weight length is invalid: {len(lr_weight)}" + + return lr_weight # lr_weightが0のblockをblock_dimsから除外する、外部から呼び出す可能性を考慮しておく def remove_block_dims_and_alphas( - block_dims, block_alphas, 
conv_block_dims, conv_block_alphas, down_lr_weight, mid_lr_weight, up_lr_weight + is_sdxl, block_dims, block_alphas, conv_block_dims, conv_block_alphas, block_lr_weight: Optional[List[float]] ): - # set 0 to block dim without learning rate to remove the block - if down_lr_weight != None: - for i, lr in enumerate(down_lr_weight): + if block_lr_weight is not None: + for i, lr in enumerate(block_lr_weight): if lr == 0: block_dims[i] = 0 if conv_block_dims is not None: conv_block_dims[i] = 0 - if mid_lr_weight != None: - if mid_lr_weight == 0: - block_dims[LoRANetwork.NUM_OF_BLOCKS] = 0 - if conv_block_dims is not None: - conv_block_dims[LoRANetwork.NUM_OF_BLOCKS] = 0 - if up_lr_weight != None: - for i, lr in enumerate(up_lr_weight): - if lr == 0: - block_dims[LoRANetwork.NUM_OF_BLOCKS + 1 + i] = 0 - if conv_block_dims is not None: - conv_block_dims[LoRANetwork.NUM_OF_BLOCKS + 1 + i] = 0 - return block_dims, block_alphas, conv_block_dims, conv_block_alphas # 外部から呼び出す可能性を考慮しておく -def get_block_index(lora_name: str) -> int: +def get_block_index(lora_name: str, is_sdxl: bool = False) -> int: block_idx = -1 # invalid lora name - - m = RE_UPDOWN.search(lora_name) - if m: - g = m.groups() - i = int(g[1]) - j = int(g[3]) - if g[2] == "resnets": - idx = 3 * i + j - elif g[2] == "attentions": - idx = 3 * i + j - elif g[2] == "upsamplers" or g[2] == "downsamplers": - idx = 3 * i + 2 - - if g[0] == "down": - block_idx = 1 + idx # 0に該当するLoRAは存在しない - elif g[0] == "up": - block_idx = LoRANetwork.NUM_OF_BLOCKS + 1 + idx - - elif "mid_block_" in lora_name: - block_idx = LoRANetwork.NUM_OF_BLOCKS # idx=12 + if not is_sdxl: + m = RE_UPDOWN.search(lora_name) + if m: + g = m.groups() + i = int(g[1]) + j = int(g[3]) + if g[2] == "resnets": + idx = 3 * i + j + elif g[2] == "attentions": + idx = 3 * i + j + elif g[2] == "upsamplers" or g[2] == "downsamplers": + idx = 3 * i + 2 + + if g[0] == "down": + block_idx = 1 + idx # 0に該当するLoRAは存在しない + elif g[0] == "up": + block_idx = LoRANetwork.NUM_OF_BLOCKS + 1 + idx + elif "mid_block_" in lora_name: + block_idx = LoRANetwork.NUM_OF_BLOCKS # idx=12 + else: + # copy from sdxl_train + if lora_name.startswith("lora_unet_"): + name = lora_name[len("lora_unet_") :] + if name.startswith("time_embed_") or name.startswith("label_emb_"): # No LoRA + block_idx = 0 # 0 + elif name.startswith("input_blocks_"): # 1-9 + block_idx = 1 + int(name.split("_")[2]) + elif name.startswith("middle_block_"): # 10-12 + block_idx = 10 + int(name.split("_")[2]) + elif name.startswith("output_blocks_"): # 13-21 + block_idx = 13 + int(name.split("_")[2]) + elif name.startswith("out_"): # 22, out, no LoRA + block_idx = 22 return block_idx @@ -742,15 +791,18 @@ def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weigh ) # block lr - down_lr_weight, mid_lr_weight, up_lr_weight = parse_block_lr_kwargs(kwargs) - if up_lr_weight is not None or mid_lr_weight is not None or down_lr_weight is not None: - network.set_block_lr_weight(up_lr_weight, mid_lr_weight, down_lr_weight) + block_lr_weight = parse_block_lr_kwargs(kwargs) + if block_lr_weight is not None: + network.set_block_lr_weight(block_lr_weight) return network, weights_sd class LoRANetwork(torch.nn.Module): NUM_OF_BLOCKS = 12 # フルモデル相当でのup,downの層の数 + NUM_OF_MID_BLOCKS = 1 + SDXL_NUM_OF_BLOCKS = 9 # SDXLのモデルでのinput/outputの層の数 total=1(base) 9(input) + 3(mid) + 9(output) + 1(out) = 23 + SDXL_NUM_OF_MID_BLOCKS = 3 UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel"] UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", 
"Downsample2D", "Upsample2D"] @@ -782,6 +834,7 @@ def __init__( modules_alpha: Optional[Dict[str, int]] = None, module_class: Type[object] = LoRAModule, varbose: Optional[bool] = False, + is_sdxl: Optional[bool] = False, ) -> None: """ LoRA network: すごく引数が多いが、パターンは以下の通り @@ -863,7 +916,7 @@ def create_modules( alpha = modules_alpha[lora_name] elif is_unet and block_dims is not None: # U-Netでblock_dims指定あり - block_idx = get_block_index(lora_name) + block_idx = get_block_index(lora_name, is_sdxl) if is_linear or is_conv2d_1x1: dim = block_dims[block_idx] alpha = block_alphas[block_idx] @@ -927,15 +980,13 @@ def create_modules( skipped = skipped_te + skipped_un if varbose and len(skipped) > 0: - logger.warning( + logger.warn( f"because block_lr_weight is 0 or dim (rank) is 0, {len(skipped)} LoRA modules are skipped / block_lr_weightまたはdim (rank)が0の為、次の{len(skipped)}個のLoRAモジュールはスキップされます:" ) for name in skipped: logger.info(f"\t{name}") - self.up_lr_weight: List[float] = None - self.down_lr_weight: List[float] = None - self.mid_lr_weight: float = None + self.block_lr_weight = None self.block_lr = False # assertion @@ -966,12 +1017,12 @@ def load_weights(self, file): def apply_to(self, text_encoder, unet, apply_text_encoder=True, apply_unet=True): if apply_text_encoder: - logger.info("enable LoRA for text encoder") + logger.info(f"enable LoRA for text encoder: {len(self.text_encoder_loras)} modules") else: self.text_encoder_loras = [] if apply_unet: - logger.info("enable LoRA for U-Net") + logger.info(f"enable LoRA for U-Net: {len(self.unet_loras)} modules") else: self.unet_loras = [] @@ -1012,34 +1063,14 @@ def merge_to(self, text_encoder, unet, weights_sd, dtype, device): logger.info(f"weights are merged") # 層別学習率用に層ごとの学習率に対する倍率を定義する 引数の順番が逆だがとりあえず気にしない - def set_block_lr_weight( - self, - up_lr_weight: List[float] = None, - mid_lr_weight: float = None, - down_lr_weight: List[float] = None, - ): + def set_block_lr_weight(self, block_lr_weight: Optional[List[float]]): self.block_lr = True - self.down_lr_weight = down_lr_weight - self.mid_lr_weight = mid_lr_weight - self.up_lr_weight = up_lr_weight - - def get_lr_weight(self, lora: LoRAModule) -> float: - lr_weight = 1.0 - block_idx = get_block_index(lora.lora_name) - if block_idx < 0: - return lr_weight - - if block_idx < LoRANetwork.NUM_OF_BLOCKS: - if self.down_lr_weight != None: - lr_weight = self.down_lr_weight[block_idx] - elif block_idx == LoRANetwork.NUM_OF_BLOCKS: - if self.mid_lr_weight != None: - lr_weight = self.mid_lr_weight - elif block_idx > LoRANetwork.NUM_OF_BLOCKS: - if self.up_lr_weight != None: - lr_weight = self.up_lr_weight[block_idx - LoRANetwork.NUM_OF_BLOCKS - 1] - - return lr_weight + self.block_lr_weight = block_lr_weight + + def get_lr_weight(self, block_idx: int) -> float: + if not self.block_lr or self.block_lr_weight is None: + return 1.0 + return self.block_lr_weight[block_idx] def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio): self.loraplus_lr_ratio = loraplus_lr_ratio @@ -1106,10 +1137,16 @@ def assemble_params(loras, lr, ratio): if self.unet_loras: if self.block_lr: + is_sdxl = False + for lora in self.unet_loras: + if "input_blocks" in lora.lora_name or "output_blocks" in lora.lora_name: + is_sdxl = True + break + # 学習率のグラフをblockごとにしたいので、blockごとにloraを分類 block_idx_to_lora = {} for lora in self.unet_loras: - idx = get_block_index(lora.lora_name) + idx = get_block_index(lora.lora_name, is_sdxl) if idx not in block_idx_to_lora: block_idx_to_lora[idx] = [] 
                block_idx_to_lora[idx].append(lora)
@@ -1118,7 +1155,7 @@ def assemble_params(loras, lr, ratio):
             for idx, block_loras in block_idx_to_lora.items():
                 params, descriptions = assemble_params(
                     block_loras,
-                    (unet_lr if unet_lr is not None else default_lr) * self.get_lr_weight(block_loras[0]),
+                    (unet_lr if unet_lr is not None else default_lr) * self.get_lr_weight(idx),
                     self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio,
                 )
                 all_params.extend(params)
diff --git a/train_network.py b/train_network.py
index c43241e8d..2976f7635 100644
--- a/train_network.py
+++ b/train_network.py
@@ -346,13 +346,13 @@ def train(self, args):
             else:
                 trainable_params = results
                 lr_descriptions = None
-        except TypeError:
+        except TypeError as e:
+            # logger.warning(f"{e}")
             # accelerator.print(
             #     "Deprecated: use prepare_optimizer_params(text_encoder_lr, unet_lr, learning_rate) instead of prepare_optimizer_params(text_encoder_lr, unet_lr)"
             # )
             trainable_params = network.prepare_optimizer_params(args.text_encoder_lr, args.unet_lr)
             lr_descriptions = None
-        print(lr_descriptions)
 
         optimizer_name, optimizer_args, optimizer = train_util.get_optimizer(args, trainable_params)
 
From 52e64c69cf249a7bc4ca6f4eebe82bc1b70e617b Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Sat, 4 May 2024 18:43:52 +0900
Subject: [PATCH 097/132] add debug log

---
 train_network.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/train_network.py b/train_network.py
index 2976f7635..feb455cea 100644
--- a/train_network.py
+++ b/train_network.py
@@ -354,6 +354,16 @@ def train(self, args):
             trainable_params = network.prepare_optimizer_params(args.text_encoder_lr, args.unet_lr)
             lr_descriptions = None
 
+        # if len(trainable_params) == 0:
+        #     accelerator.print("no trainable parameters found / 学習可能なパラメータが見つかりませんでした")
+        # for params in trainable_params:
+        #     for k, v in params.items():
+        #         if type(v) == float:
+        #             pass
+        #         else:
+        #             v = len(v)
+        #         accelerator.print(f"trainable_params: {k} = {v}")
+
         optimizer_name, optimizer_args, optimizer = train_util.get_optimizer(args, trainable_params)
 
         # dataloaderを準備する
From 7fe81502d04c1f68c85f276517e7144e6378c484 Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Mon, 6 May 2024 11:09:32 +0900
Subject: [PATCH 098/132] update loraplus on dylora/lora_fa

---
 networks/dylora.py  | 46 ++++++++++++++++++++++++---------------
 networks/lora.py    |  7 +++++-
 networks/lora_fa.py | 52 +++++++++++++++++++++++++++++++--------------
 3 files changed, 71 insertions(+), 34 deletions(-)

diff --git a/networks/dylora.py b/networks/dylora.py
index 0546fc7ae..0d1701ded 100644
--- a/networks/dylora.py
+++ b/networks/dylora.py
@@ -18,10 +18,13 @@
 import torch
 from torch import nn
 from library.utils import setup_logging
+
 setup_logging()
 import logging
+
 logger = logging.getLogger(__name__)
 
+
 class DyLoRAModule(torch.nn.Module):
     """
     replaces forward method of the original Linear, instead of replacing the original Linear module.
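# --- editorial sketch (illustration only, not part of the patch) --------------
# LoRA+ (arXiv:2402.12354) trains the up/B weights of each LoRA pair with a
# larger learning rate than the down/A weights. This commit wires the ratio
# through --network_args, e.g.:
#   --network_args "loraplus_lr_ratio=16"
#   --network_args "loraplus_unet_lr_ratio=16" "loraplus_text_encoder_lr_ratio=4"
# create_network() receives the values as strings and converts them, roughly:
ratio = "16"  # what kwargs.get("loraplus_lr_ratio", None) would return
ratio = float(ratio) if ratio is not None else None
assert ratio == 16.0
# The assemble_params() helpers in these files then route parameters whose name
# contains "lora_up" into a "plus" group whose lr is multiplied by this ratio.
# ------------------------------------------------------------------------------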
@@ -195,7 +198,7 @@ def create_network( conv_alpha = 1.0 else: conv_alpha = float(conv_alpha) - + if unit is not None: unit = int(unit) else: @@ -211,6 +214,16 @@ def create_network( unit=unit, varbose=True, ) + + loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None) + loraplus_unet_lr_ratio = kwargs.get("loraplus_unet_lr_ratio", None) + loraplus_text_encoder_lr_ratio = kwargs.get("loraplus_text_encoder_lr_ratio", None) + loraplus_lr_ratio = float(loraplus_lr_ratio) if loraplus_lr_ratio is not None else None + loraplus_unet_lr_ratio = float(loraplus_unet_lr_ratio) if loraplus_unet_lr_ratio is not None else None + loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None + if loraplus_lr_ratio is not None or loraplus_unet_lr_ratio is not None or loraplus_text_encoder_lr_ratio is not None: + network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio) + return network @@ -280,6 +293,10 @@ def __init__( self.alpha = alpha self.apply_to_conv = apply_to_conv + self.loraplus_lr_ratio = None + self.loraplus_unet_lr_ratio = None + self.loraplus_text_encoder_lr_ratio = None + if modules_dim is not None: logger.info("create LoRA network from weights") else: @@ -320,9 +337,9 @@ def create_modules(is_unet, root_module: torch.nn.Module, target_replace_modules lora = module_class(lora_name, child_module, self.multiplier, dim, alpha, unit) loras.append(lora) return loras - + text_encoders = text_encoder if type(text_encoder) == list else [text_encoder] - + self.text_encoder_loras = [] for i, text_encoder in enumerate(text_encoders): if len(text_encoders) > 1: @@ -331,7 +348,7 @@ def create_modules(is_unet, root_module: torch.nn.Module, target_replace_modules else: index = None logger.info("create LoRA for Text Encoder") - + text_encoder_loras = create_modules(False, text_encoder, DyLoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE) self.text_encoder_loras.extend(text_encoder_loras) @@ -346,6 +363,11 @@ def create_modules(is_unet, root_module: torch.nn.Module, target_replace_modules self.unet_loras = create_modules(True, unet, target_modules) logger.info(f"create LoRA for U-Net: {len(self.unet_loras)} modules.") + def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio): + self.loraplus_lr_ratio = loraplus_lr_ratio + self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio + self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio + def set_multiplier(self, multiplier): self.multiplier = multiplier for lora in self.text_encoder_loras + self.unet_loras: @@ -407,15 +429,7 @@ def merge_to(self, text_encoder, unet, weights_sd, dtype, device): """ # 二つのText Encoderに別々の学習率を設定できるようにするといいかも - def prepare_optimizer_params( - self, - text_encoder_lr, - unet_lr, - default_lr, - text_encoder_loraplus_ratio=None, - unet_loraplus_ratio=None, - loraplus_ratio=None - ): + def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr): self.requires_grad_(True) all_params = [] @@ -452,15 +466,13 @@ def assemble_params(loras, lr, ratio): params = assemble_params( self.text_encoder_loras, text_encoder_lr if text_encoder_lr is not None else default_lr, - text_encoder_loraplus_ratio or loraplus_ratio + self.loraplus_text_encoder_lr_ratio or self.loraplus_ratio, ) all_params.extend(params) if self.unet_loras: params = assemble_params( - self.unet_loras, - default_lr if unet_lr is None else unet_lr, - unet_loraplus_ratio or loraplus_ratio + 
self.unet_loras, default_lr if unet_lr is None else unet_lr, self.loraplus_unet_lr_ratio or self.loraplus_ratio ) all_params.extend(params) diff --git a/networks/lora.py b/networks/lora.py index 61b8cd5a7..6e5645577 100644 --- a/networks/lora.py +++ b/networks/lora.py @@ -499,7 +499,8 @@ def create_network( loraplus_lr_ratio = float(loraplus_lr_ratio) if loraplus_lr_ratio is not None else None loraplus_unet_lr_ratio = float(loraplus_unet_lr_ratio) if loraplus_unet_lr_ratio is not None else None loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None - network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio) + if loraplus_lr_ratio is not None or loraplus_unet_lr_ratio is not None or loraplus_text_encoder_lr_ratio is not None: + network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio) if block_lr_weight is not None: network.set_block_lr_weight(block_lr_weight) @@ -855,6 +856,10 @@ def __init__( self.rank_dropout = rank_dropout self.module_dropout = module_dropout + self.loraplus_lr_ratio = None + self.loraplus_unet_lr_ratio = None + self.loraplus_text_encoder_lr_ratio = None + if modules_dim is not None: logger.info(f"create LoRA network from weights") elif block_dims is not None: diff --git a/networks/lora_fa.py b/networks/lora_fa.py index 9a608118a..58bcb2206 100644 --- a/networks/lora_fa.py +++ b/networks/lora_fa.py @@ -15,8 +15,10 @@ import torch import re from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) RE_UPDOWN = re.compile(r"(up|down)_blocks_(\d+)_(resnets|upsamplers|downsamplers|attentions)_(\d+)_") @@ -504,6 +506,15 @@ def create_network( if up_lr_weight is not None or mid_lr_weight is not None or down_lr_weight is not None: network.set_block_lr_weight(up_lr_weight, mid_lr_weight, down_lr_weight) + loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None) + loraplus_unet_lr_ratio = kwargs.get("loraplus_unet_lr_ratio", None) + loraplus_text_encoder_lr_ratio = kwargs.get("loraplus_text_encoder_lr_ratio", None) + loraplus_lr_ratio = float(loraplus_lr_ratio) if loraplus_lr_ratio is not None else None + loraplus_unet_lr_ratio = float(loraplus_unet_lr_ratio) if loraplus_unet_lr_ratio is not None else None + loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None + if loraplus_lr_ratio is not None or loraplus_unet_lr_ratio is not None or loraplus_text_encoder_lr_ratio is not None: + network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio) + return network @@ -529,7 +540,9 @@ def parse_floats(s): len(block_dims) == num_total_blocks ), f"block_dims must have {num_total_blocks} elements / block_dimsは{num_total_blocks}個指定してください" else: - logger.warning(f"block_dims is not specified. all dims are set to {network_dim} / block_dimsが指定されていません。すべてのdimは{network_dim}になります") + logger.warning( + f"block_dims is not specified. 
all dims are set to {network_dim} / block_dimsが指定されていません。すべてのdimは{network_dim}になります" + ) block_dims = [network_dim] * num_total_blocks if block_alphas is not None: @@ -803,11 +816,17 @@ def __init__( self.rank_dropout = rank_dropout self.module_dropout = module_dropout + self.loraplus_lr_ratio = None + self.loraplus_unet_lr_ratio = None + self.loraplus_text_encoder_lr_ratio = None + if modules_dim is not None: logger.info(f"create LoRA network from weights") elif block_dims is not None: logger.info(f"create LoRA network from block_dims") - logger.info(f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}") + logger.info( + f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}" + ) logger.info(f"block_dims: {block_dims}") logger.info(f"block_alphas: {block_alphas}") if conv_block_dims is not None: @@ -815,9 +834,13 @@ def __init__( logger.info(f"conv_block_alphas: {conv_block_alphas}") else: logger.info(f"create LoRA network. base dim (rank): {lora_dim}, alpha: {alpha}") - logger.info(f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}") + logger.info( + f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}" + ) if self.conv_lora_dim is not None: - logger.info(f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}") + logger.info( + f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}" + ) # create module instances def create_modules( @@ -939,6 +962,11 @@ def create_modules( assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}" names.add(lora.lora_name) + def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio): + self.loraplus_lr_ratio = loraplus_lr_ratio + self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio + self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio + def set_multiplier(self, multiplier): self.multiplier = multiplier for lora in self.text_encoder_loras + self.unet_loras: @@ -1033,15 +1061,7 @@ def get_lr_weight(self, lora: LoRAModule) -> float: return lr_weight # 二つのText Encoderに別々の学習率を設定できるようにするといいかも - def prepare_optimizer_params( - self, - text_encoder_lr, - unet_lr, - default_lr, - text_encoder_loraplus_ratio=None, - unet_loraplus_ratio=None, - loraplus_ratio=None - ): + def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr): self.requires_grad_(True) all_params = [] @@ -1078,7 +1098,7 @@ def assemble_params(loras, lr, ratio): params = assemble_params( self.text_encoder_loras, text_encoder_lr if text_encoder_lr is not None else default_lr, - text_encoder_loraplus_ratio or loraplus_ratio + self.loraplus_text_encoder_lr_ratio or self.loraplus_ratio, ) all_params.extend(params) @@ -1097,7 +1117,7 @@ def assemble_params(loras, lr, ratio): params = assemble_params( block_loras, (unet_lr if unet_lr is not None else default_lr) * self.get_lr_weight(block_loras[0]), - unet_loraplus_ratio or loraplus_ratio + self.loraplus_unet_lr_ratio or self.loraplus_ratio, ) all_params.extend(params) @@ -1105,7 +1125,7 @@ def assemble_params(loras, lr, ratio): params = assemble_params( self.unet_loras, unet_lr if unet_lr is not None else default_lr, - unet_loraplus_ratio or loraplus_ratio + self.loraplus_unet_lr_ratio or self.loraplus_ratio, ) 
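# --- editorial sketch (illustration only, not part of the patch) --------------
# The list assembled here is a standard torch.optim param-group list: each dict
# carries its own lr, which is how per-block and LoRA+ rates coexist in a
# single optimizer. Hypothetical values:
import torch

groups = [
    {"params": [torch.nn.Parameter(torch.zeros(4, 4))], "lr": 1e-4},       # lora_down
    {"params": [torch.nn.Parameter(torch.zeros(4, 4))], "lr": 1e-4 * 16},  # lora_up (ratio 16)
]
opt = torch.optim.AdamW(groups)
assert [g["lr"] for g in opt.param_groups] == [1e-4, 16e-4]
# ------------------------------------------------------------------------------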
all_params.extend(params) From 3fd8cdc55d7d87ceca2dc1127a807a7ddafb15ae Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 6 May 2024 14:03:19 +0900 Subject: [PATCH 099/132] fix dylora loraplus --- networks/dylora.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/networks/dylora.py b/networks/dylora.py index 0d1701ded..d57e3d580 100644 --- a/networks/dylora.py +++ b/networks/dylora.py @@ -466,13 +466,13 @@ def assemble_params(loras, lr, ratio): params = assemble_params( self.text_encoder_loras, text_encoder_lr if text_encoder_lr is not None else default_lr, - self.loraplus_text_encoder_lr_ratio or self.loraplus_ratio, + self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio, ) all_params.extend(params) if self.unet_loras: params = assemble_params( - self.unet_loras, default_lr if unet_lr is None else unet_lr, self.loraplus_unet_lr_ratio or self.loraplus_ratio + self.unet_loras, default_lr if unet_lr is None else unet_lr, self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio ) all_params.extend(params) From 017b82ebe33a2199c8f842c99905f59c54292f56 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 6 May 2024 15:05:42 +0900 Subject: [PATCH 100/132] update help message for fused_backward_pass --- library/train_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index 46b55c03e..e3c0229a7 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2923,7 +2923,8 @@ def add_optimizer_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--fused_backward_pass", action="store_true", - help="Combines backward pass and optimizer step to reduce VRAM usage / バックワードパスとオプティマイザステップを組み合わせてVRAMの使用量を削減します。", + help="Combines backward pass and optimizer step to reduce VRAM usage. 
Only available in SDXL" + + " / バックワードパスとオプティマイザステップを組み合わせてVRAMの使用量を削減します。SDXLでのみ有効", ) From b56d5f7801dea45cdbbba8498544e8d2853ad6d6 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 6 May 2024 21:35:39 +0900 Subject: [PATCH 101/132] add experimental option to fuse params to optimizer groups --- sdxl_train.py | 114 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 104 insertions(+), 10 deletions(-) diff --git a/sdxl_train.py b/sdxl_train.py index 3b28575ed..c7eea2224 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -345,8 +345,8 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # calculate number of trainable parameters n_params = 0 - for params in params_to_optimize: - for p in params["params"]: + for group in params_to_optimize: + for p in group["params"]: n_params += p.numel() accelerator.print(f"train unet: {train_unet}, text_encoder1: {train_text_encoder1}, text_encoder2: {train_text_encoder2}") @@ -355,7 +355,44 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # 学習に必要なクラスを準備する accelerator.print("prepare optimizer, data loader etc.") - _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize) + + if args.fused_optimizer_groups: + # calculate total number of parameters + n_total_params = sum(len(params["params"]) for params in params_to_optimize) + params_per_group = math.ceil(n_total_params / args.fused_optimizer_groups) + + # split params into groups + grouped_params = [] + param_group = [] + param_group_lr = -1 + for group in params_to_optimize: + lr = group["lr"] + for p in group["params"]: + if lr != param_group_lr: + if param_group: + grouped_params.append({"params": param_group, "lr": param_group_lr}) + param_group = [] + param_group_lr = lr + param_group.append(p) + if len(param_group) == params_per_group: + grouped_params.append({"params": param_group, "lr": param_group_lr}) + param_group = [] + param_group_lr = -1 + if param_group: + grouped_params.append({"params": param_group, "lr": param_group_lr}) + + # prepare optimizers for each group + optimizers = [] + for group in grouped_params: + _, _, optimizer = train_util.get_optimizer(args, trainable_params=[group]) + optimizers.append(optimizer) + optimizer = optimizers[0] # avoid error in the following code + + print(len(grouped_params)) + logger.info(f"using {len(optimizers)} optimizers for fused optimizer groups") + + else: + _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize) # dataloaderを準備する # DataLoaderのプロセス数:0 は persistent_workers が使えないので注意 @@ -382,7 +419,11 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): train_dataset_group.set_max_train_steps(args.max_train_steps) # lr schedulerを用意する - lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) + if args.fused_optimizer_groups: + lr_schedulers = [train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) for optimizer in optimizers] + lr_scheduler = lr_schedulers[0] # avoid error in the following code + else: + lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) # 実験的機能:勾配も含めたfp16/bf16学習を行う モデル全体をfp16/bf16にする if args.full_fp16: @@ -432,10 +473,12 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if args.fused_backward_pass: import library.adafactor_fused + library.adafactor_fused.patch_adafactor_fused(optimizer) for param_group in optimizer.param_groups: for parameter in param_group["params"]: if parameter.requires_grad: + def __grad_hook(tensor: torch.Tensor, 
param_group=param_group): if accelerator.sync_gradients and args.max_grad_norm != 0.0: accelerator.clip_grad_norm_(tensor, args.max_grad_norm) @@ -444,6 +487,36 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): parameter.register_post_accumulate_grad_hook(__grad_hook) + elif args.fused_optimizer_groups: + for i in range(1, len(optimizers)): + optimizers[i] = accelerator.prepare(optimizers[i]) + lr_schedulers[i] = accelerator.prepare(lr_schedulers[i]) + + global optimizer_hooked_count + global num_parameters_per_group + global parameter_optimizer_map + optimizer_hooked_count = {} + num_parameters_per_group = [0] * len(optimizers) + parameter_optimizer_map = {} + for opt_idx, optimizer in enumerate(optimizers): + for param_group in optimizer.param_groups: + for parameter in param_group["params"]: + if parameter.requires_grad: + + def optimizer_hook(parameter: torch.Tensor): + if accelerator.sync_gradients and args.max_grad_norm != 0.0: + accelerator.clip_grad_norm_(parameter, args.max_grad_norm) + + i = parameter_optimizer_map[parameter] + optimizer_hooked_count[i] += 1 + if optimizer_hooked_count[i] == num_parameters_per_group[i]: + optimizers[i].step() + optimizers[i].zero_grad() + + parameter.register_post_accumulate_grad_hook(optimizer_hook) + parameter_optimizer_map[parameter] = opt_idx + num_parameters_per_group[opt_idx] += 1 + # TextEncoderの出力をキャッシュするときにはCPUへ移動する if args.cache_text_encoder_outputs: # move Text Encoders for sampling images. Text Encoder doesn't work on CPU with fp16 @@ -518,6 +591,10 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): for step, batch in enumerate(train_dataloader): current_step.value = global_step + + if args.fused_optimizer_groups: + optimizer_hooked_count = {i: 0 for i in range(len(optimizers))} + with accelerator.accumulate(*training_models): if "latents" in batch and batch["latents"] is not None: latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) @@ -596,7 +673,9 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) + noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps( + args, noise_scheduler, latents + ) noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype @@ -614,7 +693,9 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): or args.masked_loss ): # do not mean over batch dimension for snr weight or scale v-pred loss - loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) + loss = train_util.conditional_loss( + noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + ) if args.masked_loss: loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) @@ -630,11 +711,13 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): loss = loss.mean() # mean over batch dimension else: - loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c) + loss = train_util.conditional_loss( + noise_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c + ) accelerator.backward(loss) - if not 
args.fused_backward_pass: + if not (args.fused_backward_pass or args.fused_optimizer_groups): if accelerator.sync_gradients and args.max_grad_norm != 0.0: params_to_clip = [] for m in training_models: @@ -642,9 +725,14 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) optimizer.step() + elif args.fused_optimizer_groups: + for i in range(1, len(optimizers)): + lr_schedulers[i].step() lr_scheduler.step() - optimizer.zero_grad(set_to_none=True) + + if not (args.fused_backward_pass or args.fused_optimizer_groups): + optimizer.zero_grad(set_to_none=True) # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: @@ -753,7 +841,7 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): accelerator.end_training() - if args.save_state or args.save_state_on_train_end: + if args.save_state or args.save_state_on_train_end: train_util.save_state_on_train_end(args, accelerator) del accelerator # この後メモリを使うのでこれは消す @@ -822,6 +910,12 @@ def setup_parser() -> argparse.ArgumentParser: help=f"learning rates for each block of U-Net, comma-separated, {UNET_NUM_BLOCKS_FOR_BLOCK_LR} values / " + f"U-Netの各ブロックの学習率、カンマ区切り、{UNET_NUM_BLOCKS_FOR_BLOCK_LR}個の値", ) + parser.add_argument( + "--fused_optimizer_groups", + type=int, + default=None, + help="number of optimizers for fused backward pass and optimizer step / fused backward passとoptimizer stepのためのoptimizer数", + ) return parser From 793aeb94da53565fb08c7b0b2538f2ade04824bb Mon Sep 17 00:00:00 2001 From: AngelBottomless Date: Tue, 7 May 2024 18:21:31 +0900 Subject: [PATCH 102/132] fix get_trainable_params in controlnet-llite training --- sdxl_train_control_net_lllite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index f89c3628f..6ad6e763c 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -477,7 +477,7 @@ def remove_model(old_ckpt_name): accelerator.backward(loss) if accelerator.sync_gradients and args.max_grad_norm != 0.0: - params_to_clip = unet.get_trainable_params() + params_to_clip = accelerator.unwrap_model(unet).get_trainable_params() accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) optimizer.step() From 607e041f3de972f2c3030e7c8b43dfc3c2eb2d65 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 12 May 2024 14:16:41 +0900 Subject: [PATCH 103/132] chore: Refactor optimizer group --- sdxl_train.py | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/sdxl_train.py b/sdxl_train.py index c7eea2224..be2b7166e 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -357,27 +357,37 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): accelerator.print("prepare optimizer, data loader etc.") if args.fused_optimizer_groups: + # fused backward pass: https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html + # Instead of creating an optimizer for all parameters as in the tutorial, we create an optimizer for each group of parameters. + # This balances memory usage and management complexity. 
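# --- editorial sketch (illustration only, not part of the patch) --------------
# The per-parameter pattern from the tutorial linked above: step()/zero_grad()
# run inside backward, so the full set of gradients is never resident at once.
import torch

model = torch.nn.Linear(16, 16)
opts = {p: torch.optim.AdamW([p], lr=1e-4) for p in model.parameters()}

def hook(p: torch.Tensor) -> None:
    opts[p].step()
    opts[p].zero_grad(set_to_none=True)

for p in model.parameters():
    p.register_post_accumulate_grad_hook(hook)  # requires PyTorch >= 2.1

model(torch.randn(2, 16)).sum().backward()  # optimizers step during backward
# --fused_optimizer_groups coarsens this to N optimizers over N parameter
# groups, trading a little peak memory for far fewer optimizer instances.
# ------------------------------------------------------------------------------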
+ # calculate total number of parameters n_total_params = sum(len(params["params"]) for params in params_to_optimize) params_per_group = math.ceil(n_total_params / args.fused_optimizer_groups) - # split params into groups + # split params into groups, keeping the learning rate the same for all params in a group + # this will increase the number of groups if the learning rate is different for different params (e.g. U-Net and text encoders) grouped_params = [] param_group = [] param_group_lr = -1 for group in params_to_optimize: lr = group["lr"] for p in group["params"]: + # if the learning rate is different for different params, start a new group if lr != param_group_lr: if param_group: grouped_params.append({"params": param_group, "lr": param_group_lr}) param_group = [] param_group_lr = lr + param_group.append(p) + + # if the group has enough parameters, start a new group if len(param_group) == params_per_group: grouped_params.append({"params": param_group, "lr": param_group_lr}) param_group = [] param_group_lr = -1 + if param_group: grouped_params.append({"params": param_group, "lr": param_group_lr}) @@ -388,7 +398,6 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): optimizers.append(optimizer) optimizer = optimizers[0] # avoid error in the following code - print(len(grouped_params)) logger.info(f"using {len(optimizers)} optimizers for fused optimizer groups") else: @@ -420,6 +429,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # lr schedulerを用意する if args.fused_optimizer_groups: + # prepare lr schedulers for each optimizer lr_schedulers = [train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) for optimizer in optimizers] lr_scheduler = lr_schedulers[0] # avoid error in the following code else: @@ -472,6 +482,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) if args.fused_backward_pass: + # use fused optimizer for backward pass: other optimizers will be supported in the future import library.adafactor_fused library.adafactor_fused.patch_adafactor_fused(optimizer) @@ -488,16 +499,20 @@ def __grad_hook(tensor: torch.Tensor, param_group=param_group): parameter.register_post_accumulate_grad_hook(__grad_hook) elif args.fused_optimizer_groups: + # prepare for additional optimizers and lr schedulers for i in range(1, len(optimizers)): optimizers[i] = accelerator.prepare(optimizers[i]) lr_schedulers[i] = accelerator.prepare(lr_schedulers[i]) + # counters are used to determine when to step the optimizer global optimizer_hooked_count global num_parameters_per_group global parameter_optimizer_map + optimizer_hooked_count = {} num_parameters_per_group = [0] * len(optimizers) parameter_optimizer_map = {} + for opt_idx, optimizer in enumerate(optimizers): for param_group in optimizer.param_groups: for parameter in param_group["params"]: @@ -511,7 +526,7 @@ def optimizer_hook(parameter: torch.Tensor): optimizer_hooked_count[i] += 1 if optimizer_hooked_count[i] == num_parameters_per_group[i]: optimizers[i].step() - optimizers[i].zero_grad() + optimizers[i].zero_grad(set_to_none=True) parameter.register_post_accumulate_grad_hook(optimizer_hook) parameter_optimizer_map[parameter] = opt_idx @@ -593,7 +608,7 @@ def optimizer_hook(parameter: torch.Tensor): current_step.value = global_step if args.fused_optimizer_groups: - optimizer_hooked_count = {i: 0 for i in range(len(optimizers))} + optimizer_hooked_count = {i: 0 for i in range(len(optimizers))} # reset 
counter for each step with accelerator.accumulate(*training_models): if "latents" in batch and batch["latents"] is not None: @@ -725,14 +740,14 @@ def optimizer_hook(parameter: torch.Tensor): accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) optimizer.step() - elif args.fused_optimizer_groups: - for i in range(1, len(optimizers)): - lr_schedulers[i].step() - - lr_scheduler.step() - - if not (args.fused_backward_pass or args.fused_optimizer_groups): + lr_scheduler.step() optimizer.zero_grad(set_to_none=True) + else: + # optimizer.step() and optimizer.zero_grad() are called in the optimizer hook + lr_scheduler.step() + if args.fused_optimizer_groups: + for i in range(1, len(optimizers)): + lr_schedulers[i].step() # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: From c1ba0b4356637c881ea99663fcce5943fc33fc56 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 12 May 2024 14:21:10 +0900 Subject: [PATCH 104/132] update readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index a7047a360..859a7618d 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,14 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser ## Change History +### Working in progress + +- Fixed some bugs when using DeepSpeed. Related [#1247] + + +- DeepSpeed 使用時のいくつかのバグを修正しました。関連 [#1247] + + ### Apr 7, 2024 / 2024-04-07: v0.8.7 - The default value of `huber_schedule` in Scheduled Huber Loss is changed from `exponential` to `snr`, which is expected to give better results. From f3d2cf22ff9ad49e7f8bd68494714fa3bedbd77d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 12 May 2024 15:03:02 +0900 Subject: [PATCH 105/132] update README for fused optimizer --- README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/README.md b/README.md index 859a7618d..4fd97fb25 100644 --- a/README.md +++ b/README.md @@ -139,8 +139,37 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser ### Working in progress +- Fused optimizer is available for SDXL training. PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) Thanks to 2kpr! + - The memory usage during training is significantly reduced by integrating the optimizer's backward pass with step. The training results are the same as before, but if you have plenty of memory, the speed will be slower. + - Specify the `--fused_backward_pass` option in `sdxl_train.py`. At this time, only AdaFactor is supported. Gradient accumulation is not available. + - Setting mixed precision to `no` seems to use less memory than `fp16` or `bf16`. + - Training is possible with a memory usage of about 17GB with a batch size of 1 and fp32. If you specify the `--full_bf16` option, you can further reduce the memory usage (but the accuracy will be lower). With the same memory usage as before, you can increase the batch size. + - PyTorch 2.1 or later is required because it uses the new API `Tensor.register_post_accumulate_grad_hook(hook)`. + - Mechanism: Normally, backward -> step is performed for each parameter, so all gradients need to be temporarily stored in memory. "Fuse backward and step" reduces memory usage by performing backward/step for each parameter and reflecting the gradient immediately. + +- Optimizer groups feature is added to SDXL training. PR [#1319](https://github.com/kohya-ss/sd-scripts/pull/1319) + - Memory usage is reduced by the same principle as Fused optimizer. 
The training results and speed are the same as Fused optimizer. + - Specify the number of groups like `--fused_optimizer_groups 10` in `sdxl_train.py`. Increasing the number of groups reduces memory usage but slows down training. Since the effect is limited to a certain number, it is recommended to specify 4-10. + - Any optimizer can be used, but optimizers that automatically calculate the learning rate (such as D-Adaptation and Prodigy) cannot be used. Gradient accumulation is not available. + - `--fused_optimizer_groups` cannot be used with `--fused_backward_pass`. When using AdaFactor, the memory usage is slightly larger than with Fused optimizer. PyTorch 2.1 or later is required. + - Mechanism: While Fused optimizer performs backward/step for individual parameters within the optimizer, optimizer groups reduce memory usage by grouping parameters and creating multiple optimizers to perform backward/step for each group. Fused optimizer requires implementation on the optimizer side, while optimizer groups are implemented only on the training script side. + - Fixed some bugs when using DeepSpeed. Related [#1247] +- SDXL の学習時に Fused optimizer が使えるようになりました。PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) 2kpr 氏に感謝します。 + - optimizer の backward pass に step を統合することで学習時のメモリ使用量を大きく削減します。学習結果は未適用時と同一ですが、メモリが潤沢にある場合は速度は遅くなります。 + - `sdxl_train.py` に `--fused_backward_pass` オプションを指定してください。現時点では optimizer は AdaFactor のみ対応しています。また gradient accumulation は使えません。 + - mixed precision は `no` のほうが `fp16` や `bf16` よりも使用メモリ量が少ないようです。 + - バッチサイズ 1、fp32 で 17GB 程度で学習可能なようです。`--full_bf16` オプションを指定するとさらに削減できます(精度は劣ります)。以前と同じメモリ使用量ではバッチサイズを増やせます。 + - PyTorch 2.1 以降の新 API `Tensor.register_post_accumulate_grad_hook(hook)` を使用しているため、PyTorch 2.1 以降が必要です。 + - 仕組み:通常は backward -> step の順で行うためすべての勾配を一時的にメモリに保持する必要があります。「backward と step の統合」はパラメータごとに backward/step を行って、勾配をすぐ反映することでメモリ使用量を削減します。 + +- SDXL の学習時に optimizer group 機能を追加しました。PR [#1319](https://github.com/kohya-ss/sd-scripts/pull/1319) + - Fused optimizer と同様の原理でメモリ使用量を削減します。学習結果や速度についても同様です。 + - `sdxl_train.py` に `--fused_optimizer_groups 10` のようにグループ数を指定してください。グループ数を増やすとメモリ使用量が削減されますが、速度は遅くなります。ある程度の数までしか効果がないため、4~10 程度を指定すると良いでしょう。 + - 任意の optimizer が使えますが、学習率を自動計算する optimizer (D-Adaptation や Prodigy など)は使えません。gradient accumulation は使えません。 + - `--fused_optimizer_groups` は `--fused_backward_pass` と併用できません。AdaFactor 使用時は Fused optimizer よりも若干メモリ使用量は大きくなります。PyTorch 2.1 以降が必要です。 + - 仕組み:Fused optimizer が optimizer 内で個別のパラメータについて backward/step を行っているのに対して、optimizer groups はパラメータをグループ化して複数の optimizer を作成し、それぞれ backward/step を行うことでメモリ使用量を削減します。Fused optimizer は optimizer 側の実装が必要ですが、optimizer groups は学習スクリプト側のみで実装されています。 - DeepSpeed 使用時のいくつかのバグを修正しました。関連 [#1247] From bee8cee7e8fbeecc05b1c80a1e9e8fadab3210a5 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 12 May 2024 15:08:52 +0900 Subject: [PATCH 106/132] update README for fused optimizer --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4fd97fb25..9c7ecad99 100644 --- a/README.md +++ b/README.md @@ -145,7 +145,7 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - Setting mixed precision to `no` seems to use less memory than `fp16` or `bf16`. - Training is possible with a memory usage of about 17GB with a batch size of 1 and fp32. If you specify the `--full_bf16` option, you can further reduce the memory usage (but the accuracy will be lower). 
With the same memory usage as before, you can increase the batch size. - PyTorch 2.1 or later is required because it uses the new API `Tensor.register_post_accumulate_grad_hook(hook)`. - - Mechanism: Normally, backward -> step is performed for each parameter, so all gradients need to be temporarily stored in memory. "Fuse backward and step" reduces memory usage by performing backward/step for each parameter and reflecting the gradient immediately. + - Mechanism: Normally, backward -> step is performed for each parameter, so all gradients need to be temporarily stored in memory. "Fuse backward and step" reduces memory usage by performing backward/step for each parameter and reflecting the gradient immediately. The more parameters there are, the greater the effect, so it is not effective in other training scripts (LoRA, etc.) where the memory usage peak is elsewhere, and there are no plans to implement it in those training scripts. - Optimizer groups feature is added to SDXL training. PR [#1319](https://github.com/kohya-ss/sd-scripts/pull/1319) - Memory usage is reduced by the same principle as Fused optimizer. The training results and speed are the same as Fused optimizer. @@ -162,14 +162,14 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - mixed precision は `no` のほうが `fp16` や `bf16` よりも使用メモリ量が少ないようです。 - バッチサイズ 1、fp32 で 17GB 程度で学習可能なようです。`--full_bf16` オプションを指定するとさらに削減できます(精度は劣ります)。以前と同じメモリ使用量ではバッチサイズを増やせます。 - PyTorch 2.1 以降の新 API `Tensor.register_post_accumulate_grad_hook(hook)` を使用しているため、PyTorch 2.1 以降が必要です。 - - 仕組み:通常は backward -> step の順で行うためすべての勾配を一時的にメモリに保持する必要があります。「backward と step の統合」はパラメータごとに backward/step を行って、勾配をすぐ反映することでメモリ使用量を削減します。 + - 仕組み:通常は backward -> step の順で行うためすべての勾配を一時的にメモリに保持する必要があります。「backward と step の統合」はパラメータごとに backward/step を行って、勾配をすぐ反映することでメモリ使用量を削減します。パラメータ数が多いほど効果が大きいため、SDXL の学習以外(LoRA 等)ではほぼ効果がなく(メモリ使用量のピークが他の場所にあるため)、それらの学習スクリプトへの実装予定もありません。 - SDXL の学習時に optimizer group 機能を追加しました。PR [#1319](https://github.com/kohya-ss/sd-scripts/pull/1319) - Fused optimizer と同様の原理でメモリ使用量を削減します。学習結果や速度についても同様です。 - `sdxl_train.py` に `--fused_optimizer_groups 10` のようにグループ数を指定してください。グループ数を増やすとメモリ使用量が削減されますが、速度は遅くなります。ある程度の数までしか効果がないため、4~10 程度を指定すると良いでしょう。 - 任意の optimizer が使えますが、学習率を自動計算する optimizer (D-Adaptation や Prodigy など)は使えません。gradient accumulation は使えません。 - `--fused_optimizer_groups` は `--fused_backward_pass` と併用できません。AdaFactor 使用時は Fused optimizer よりも若干メモリ使用量は大きくなります。PyTorch 2.1 以降が必要です。 - - 仕組み:Fused optimizer が optimizer 内で個別のパラメータについて backward/step を行っているのに対して、optimizer groups はパラメータをグループ化して複数の optimizer を作成し、それぞれ backward/step を行うことでメモリ使用量を削減します。Fused optimizer は optimizer 側の実装が必要ですが、optimizer groups は学習スクリプト側のみで実装されています。 + - 仕組み:Fused optimizer が optimizer 内で個別のパラメータについて backward/step を行っているのに対して、optimizer groups はパラメータをグループ化して複数の optimizer を作成し、それぞれ backward/step を行うことでメモリ使用量を削減します。Fused optimizer は optimizer 側の実装が必要ですが、optimizer groups は学習スクリプト側のみで実装されています。やはり SDXL の学習でのみ効果があります。 - DeepSpeed 使用時のいくつかのバグを修正しました。関連 [#1247] From 1ffc0b330aa362a408e46e9a52784d72aa73d263 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 12 May 2024 16:18:43 +0900 Subject: [PATCH 107/132] fix typo --- library/train_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index e3c0229a7..b2de8a216 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3093,7 +3093,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: ) parser.add_argument("--seed", 
type=int, default=None, help="random seed for training / 学習時の乱数のseed") parser.add_argument( - "--gradient_checkpointing", action="store_true", help="enable gradient checkpointing / grandient checkpointingを有効にする" + "--gradient_checkpointing", action="store_true", help="enable gradient checkpointing / gradient checkpointingを有効にする" ) parser.add_argument( "--gradient_accumulation_steps", From 3c8193f64269fff68d16c1f38dedfde8715f70bb Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 12 May 2024 17:00:51 +0900 Subject: [PATCH 108/132] revert lora+ for lora_fa --- networks/lora_fa.py | 104 +++++++++++--------------------------------- 1 file changed, 25 insertions(+), 79 deletions(-) diff --git a/networks/lora_fa.py b/networks/lora_fa.py index 58bcb2206..919222ce8 100644 --- a/networks/lora_fa.py +++ b/networks/lora_fa.py @@ -15,10 +15,8 @@ import torch import re from library.utils import setup_logging - setup_logging() import logging - logger = logging.getLogger(__name__) RE_UPDOWN = re.compile(r"(up|down)_blocks_(\d+)_(resnets|upsamplers|downsamplers|attentions)_(\d+)_") @@ -506,15 +504,6 @@ def create_network( if up_lr_weight is not None or mid_lr_weight is not None or down_lr_weight is not None: network.set_block_lr_weight(up_lr_weight, mid_lr_weight, down_lr_weight) - loraplus_lr_ratio = kwargs.get("loraplus_lr_ratio", None) - loraplus_unet_lr_ratio = kwargs.get("loraplus_unet_lr_ratio", None) - loraplus_text_encoder_lr_ratio = kwargs.get("loraplus_text_encoder_lr_ratio", None) - loraplus_lr_ratio = float(loraplus_lr_ratio) if loraplus_lr_ratio is not None else None - loraplus_unet_lr_ratio = float(loraplus_unet_lr_ratio) if loraplus_unet_lr_ratio is not None else None - loraplus_text_encoder_lr_ratio = float(loraplus_text_encoder_lr_ratio) if loraplus_text_encoder_lr_ratio is not None else None - if loraplus_lr_ratio is not None or loraplus_unet_lr_ratio is not None or loraplus_text_encoder_lr_ratio is not None: - network.set_loraplus_lr_ratio(loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio) - return network @@ -540,9 +529,7 @@ def parse_floats(s): len(block_dims) == num_total_blocks ), f"block_dims must have {num_total_blocks} elements / block_dimsは{num_total_blocks}個指定してください" else: - logger.warning( - f"block_dims is not specified. all dims are set to {network_dim} / block_dimsが指定されていません。すべてのdimは{network_dim}になります" - ) + logger.warning(f"block_dims is not specified. all dims are set to {network_dim} / block_dimsが指定されていません。すべてのdimは{network_dim}になります") block_dims = [network_dim] * num_total_blocks if block_alphas is not None: @@ -816,17 +803,11 @@ def __init__( self.rank_dropout = rank_dropout self.module_dropout = module_dropout - self.loraplus_lr_ratio = None - self.loraplus_unet_lr_ratio = None - self.loraplus_text_encoder_lr_ratio = None - if modules_dim is not None: logger.info(f"create LoRA network from weights") elif block_dims is not None: logger.info(f"create LoRA network from block_dims") - logger.info( - f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}" - ) + logger.info(f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}") logger.info(f"block_dims: {block_dims}") logger.info(f"block_alphas: {block_alphas}") if conv_block_dims is not None: @@ -834,13 +815,9 @@ def __init__( logger.info(f"conv_block_alphas: {conv_block_alphas}") else: logger.info(f"create LoRA network. 
base dim (rank): {lora_dim}, alpha: {alpha}") - logger.info( - f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}" - ) + logger.info(f"neuron dropout: p={self.dropout}, rank dropout: p={self.rank_dropout}, module dropout: p={self.module_dropout}") if self.conv_lora_dim is not None: - logger.info( - f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}" - ) + logger.info(f"apply LoRA to Conv2d with kernel size (3,3). dim (rank): {self.conv_lora_dim}, alpha: {self.conv_alpha}") # create module instances def create_modules( @@ -962,11 +939,6 @@ def create_modules( assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}" names.add(lora.lora_name) - def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, loraplus_text_encoder_lr_ratio): - self.loraplus_lr_ratio = loraplus_lr_ratio - self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio - self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio - def set_multiplier(self, multiplier): self.multiplier = multiplier for lora in self.text_encoder_loras + self.unet_loras: @@ -1065,42 +1037,18 @@ def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr): self.requires_grad_(True) all_params = [] - def assemble_params(loras, lr, ratio): - param_groups = {"lora": {}, "plus": {}} - for lora in loras: - for name, param in lora.named_parameters(): - if ratio is not None and "lora_up" in name: - param_groups["plus"][f"{lora.lora_name}.{name}"] = param - else: - param_groups["lora"][f"{lora.lora_name}.{name}"] = param - + def enumerate_params(loras: List[LoRAModule]): params = [] - for key in param_groups.keys(): - param_data = {"params": param_groups[key].values()} - - if len(param_data["params"]) == 0: - continue - - if lr is not None: - if key == "plus": - param_data["lr"] = lr * ratio - else: - param_data["lr"] = lr - - if param_data.get("lr", None) == 0 or param_data.get("lr", None) is None: - continue - - params.append(param_data) - + for lora in loras: + # params.extend(lora.parameters()) + params.extend(lora.get_trainable_params()) return params if self.text_encoder_loras: - params = assemble_params( - self.text_encoder_loras, - text_encoder_lr if text_encoder_lr is not None else default_lr, - self.loraplus_text_encoder_lr_ratio or self.loraplus_ratio, - ) - all_params.extend(params) + param_data = {"params": enumerate_params(self.text_encoder_loras)} + if text_encoder_lr is not None: + param_data["lr"] = text_encoder_lr + all_params.append(param_data) if self.unet_loras: if self.block_lr: @@ -1114,20 +1062,21 @@ def assemble_params(loras, lr, ratio): # blockごとにパラメータを設定する for idx, block_loras in block_idx_to_lora.items(): - params = assemble_params( - block_loras, - (unet_lr if unet_lr is not None else default_lr) * self.get_lr_weight(block_loras[0]), - self.loraplus_unet_lr_ratio or self.loraplus_ratio, - ) - all_params.extend(params) + param_data = {"params": enumerate_params(block_loras)} + + if unet_lr is not None: + param_data["lr"] = unet_lr * self.get_lr_weight(block_loras[0]) + elif default_lr is not None: + param_data["lr"] = default_lr * self.get_lr_weight(block_loras[0]) + if ("lr" in param_data) and (param_data["lr"] == 0): + continue + all_params.append(param_data) else: - params = assemble_params( - self.unet_loras, - unet_lr if unet_lr is not None else default_lr, - self.loraplus_unet_lr_ratio or self.loraplus_ratio, - ) - all_params.extend(params) + 
param_data = {"params": enumerate_params(self.unet_loras)} + if unet_lr is not None: + param_data["lr"] = unet_lr + all_params.append(param_data) return all_params @@ -1144,9 +1093,6 @@ def on_epoch_start(self, text_encoder, unet): def get_trainable_params(self): return self.parameters() - def get_trainable_named_params(self): - return self.named_parameters() - def save_weights(self, file, dtype, metadata): if metadata is not None and len(metadata) == 0: metadata = None From 44190416c6389d9ae9ffb18c28744be1259fc02c Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 12 May 2024 17:01:20 +0900 Subject: [PATCH 109/132] update docs etc. --- README.md | 26 ++++++++++++++++++++++++-- docs/train_network_README-ja.md | 11 +++++++---- networks/lora.py | 2 +- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 9c7ecad99..b10da0f23 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,18 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - `--fused_optimizer_groups` cannot be used with `--fused_backward_pass`. When using AdaFactor, the memory usage is slightly larger than with Fused optimizer. PyTorch 2.1 or later is required. - Mechanism: While Fused optimizer performs backward/step for individual parameters within the optimizer, optimizer groups reduce memory usage by grouping parameters and creating multiple optimizers to perform backward/step for each group. Fused optimizer requires implementation on the optimizer side, while optimizer groups are implemented only on the training script side. -- Fixed some bugs when using DeepSpeed. Related [#1247] +- LoRA+ is supported. PR [#1233](https://github.com/kohya-ss/sd-scripts/pull/1233) Thanks to rockerBOO! + - LoRA+ is a method to improve training speed by increasing the learning rate of the UP side (LoRA-B) of LoRA. Specify the multiple. The original paper recommends 16, but adjust as needed. Please see the PR for details. + - Specify `loraplus_lr_ratio` with `--network_args`. Example: `--network_args "loraplus_lr_ratio=16"` + - `loraplus_unet_lr_ratio` and `loraplus_lr_ratio` can be specified separately for U-Net and Text Encoder. + - Example: `--network_args "loraplus_unet_lr_ratio=16" "loraplus_text_encoder_lr_ratio=4"` or `--network_args "loraplus_lr_ratio=16" "loraplus_text_encoder_lr_ratio=4"` etc. + - `network_module` `networks.lora` and `networks.dylora` are available. + +- LoRA training in SDXL now supports block-wise learning rates and block-wise dim (rank). PR [#1331](https://github.com/kohya-ss/sd-scripts/pull/1331) + - Specify the learning rate and dim (rank) for each block. + - See [Block-wise learning rates in LoRA](./docs/train_network_README-ja.md#階層別学習率) for details (Japanese only). + +- Fixed some bugs when using DeepSpeed. 
Related [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247) - SDXL の学習時に Fused optimizer が使えるようになりました。PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) 2kpr 氏に感謝します。 - optimizer の backward pass に step を統合することで学習時のメモリ使用量を大きく削減します。学習結果は未適用時と同一ですが、メモリが潤沢にある場合は速度は遅くなります。 @@ -171,7 +182,18 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - `--fused_optimizer_groups` は `--fused_backward_pass` と併用できません。AdaFactor 使用時は Fused optimizer よりも若干メモリ使用量は大きくなります。PyTorch 2.1 以降が必要です。 - 仕組み:Fused optimizer が optimizer 内で個別のパラメータについて backward/step を行っているのに対して、optimizer groups はパラメータをグループ化して複数の optimizer を作成し、それぞれ backward/step を行うことでメモリ使用量を削減します。Fused optimizer は optimizer 側の実装が必要ですが、optimizer groups は学習スクリプト側のみで実装されています。やはり SDXL の学習でのみ効果があります。 -- DeepSpeed 使用時のいくつかのバグを修正しました。関連 [#1247] +- LoRA+ がサポートされました。PR [#1233](https://github.com/kohya-ss/sd-scripts/pull/1233) rockerBOO 氏に感謝します。 + - LoRA の UP 側(LoRA-B)の学習率を上げることで学習速度の向上を図る手法です。倍数で指定します。元の論文では 16 が推奨されていますが、データセット等にもよりますので、適宜調整してください。PR もあわせてご覧ください。 + - `--network_args` で `loraplus_lr_ratio` を指定します。例:`--network_args "loraplus_lr_ratio=16"` + - `loraplus_unet_lr_ratio` と `loraplus_lr_ratio` で、U-Net および Text Encoder に個別の値を指定することも可能です。 + - 例:`--network_args "loraplus_unet_lr_ratio=16" "loraplus_text_encoder_lr_ratio=4"` または `--network_args "loraplus_lr_ratio=16" "loraplus_text_encoder_lr_ratio=4"` など + - `network_module` の `networks.lora` および `networks.dylora` で使用可能です。 + +- SDXL の LoRA で階層別学習率、階層別 dim (rank) をサポートしました。PR [#1331](https://github.com/kohya-ss/sd-scripts/pull/1331) + - ブロックごとに学習率および dim (rank) を指定することができます。 + - 詳細は [LoRA の階層別学習率](./docs/train_network_README-ja.md#階層別学習率) をご覧ください。 + +- DeepSpeed 使用時のいくつかのバグを修正しました。関連 [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247) ### Apr 7, 2024 / 2024-04-07: v0.8.7 diff --git a/docs/train_network_README-ja.md b/docs/train_network_README-ja.md index 2205a7736..46085117c 100644 --- a/docs/train_network_README-ja.md +++ b/docs/train_network_README-ja.md @@ -181,16 +181,16 @@ python networks\extract_lora_from_dylora.py --model "foldername/dylora-model.saf 詳細は[PR #355](https://github.com/kohya-ss/sd-scripts/pull/355) をご覧ください。 -SDXLは現在サポートしていません。 - フルモデルの25個のブロックの重みを指定できます。最初のブロックに該当するLoRAは存在しませんが、階層別LoRA適用等との互換性のために25個としています。またconv2d3x3に拡張しない場合も一部のブロックにはLoRAが存在しませんが、記述を統一するため常に25個の値を指定してください。 +SDXL では down/up 9 個、middle 3 個の値を指定してください。 + `--network_args` で以下の引数を指定してください。 - `down_lr_weight` : U-Netのdown blocksの学習率の重みを指定します。以下が指定可能です。 - - ブロックごとの重み : `"down_lr_weight=0,0,0,0,0,0,1,1,1,1,1,1"` のように12個の数値を指定します。 + - ブロックごとの重み : `"down_lr_weight=0,0,0,0,0,0,1,1,1,1,1,1"` のように12個(SDXL では 9 個)の数値を指定します。 - プリセットからの指定 : `"down_lr_weight=sine"` のように指定します(サインカーブで重みを指定します)。sine, cosine, linear, reverse_linear, zeros が指定可能です。また `"down_lr_weight=cosine+.25"` のように `+数値` を追加すると、指定した数値を加算します(0.25~1.25になります)。 -- `mid_lr_weight` : U-Netのmid blockの学習率の重みを指定します。`"down_lr_weight=0.5"` のように数値を一つだけ指定します。 +- `mid_lr_weight` : U-Netのmid blockの学習率の重みを指定します。`"down_lr_weight=0.5"` のように数値を一つだけ指定します(SDXL の場合は 3 個)。 - `up_lr_weight` : U-Netのup blocksの学習率の重みを指定します。down_lr_weightと同様です。 - 指定を省略した部分は1.0として扱われます。また重みを0にするとそのブロックのLoRAモジュールは作成されません。 - `block_lr_zero_threshold` : 重みがこの値以下の場合、LoRAモジュールを作成しません。デフォルトは0です。 @@ -215,6 +215,9 @@ network_args = [ "block_lr_zero_threshold=0.1", "down_lr_weight=sine+.5", "mid_l フルモデルの25個のブロックのdim (rank)を指定できます。階層別学習率と同様に一部のブロックにはLoRAが存在しない場合がありますが、常に25個の値を指定してください。 +SDXL では 23 個の値を指定してください。一部のブロックにはLoRA が存在しませんが、`sdxl_train.py` 
の[階層別学習率](./train_SDXL-en.md) との互換性のためです。 +対応は、`0: time/label embed, 1-9: input blocks 0-8, 10-12: mid blocks 0-2, 13-21: output blocks 0-8, 22: out` です。 + `--network_args` で以下の引数を指定してください。 - `block_dims` : 各ブロックのdim (rank)を指定します。`"block_dims=2,2,2,2,4,4,4,4,6,6,6,6,8,6,6,6,6,4,4,4,4,2,2,2,2"` のように25個の数値を指定します。 diff --git a/networks/lora.py b/networks/lora.py index 6e5645577..00d21b0ed 100644 --- a/networks/lora.py +++ b/networks/lora.py @@ -985,7 +985,7 @@ def create_modules( skipped = skipped_te + skipped_un if varbose and len(skipped) > 0: - logger.warn( + logger.warning( f"because block_lr_weight is 0 or dim (rank) is 0, {len(skipped)} LoRA modules are skipped / block_lr_weightまたはdim (rank)が0の為、次の{len(skipped)}個のLoRAモジュールはスキップされます:" ) for name in skipped: From 9ddb4d7a0138722913f6f1a6f1bf30f7ff89bb5b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 12 May 2024 17:55:08 +0900 Subject: [PATCH 110/132] update readme and help message etc. --- README.md | 8 ++++++++ library/sdxl_model_util.py | 6 ++++-- library/sdxl_train_util.py | 6 +++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b10da0f23..ed91d6d7b 100644 --- a/README.md +++ b/README.md @@ -165,6 +165,10 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - Specify the learning rate and dim (rank) for each block. - See [Block-wise learning rates in LoRA](./docs/train_network_README-ja.md#階層別学習率) for details (Japanese only). +- An option `--disable_mmap_load_safetensors` is added to disable memory mapping when loading the model's .safetensors in SDXL. PR [#1266](https://github.com/kohya-ss/sd-scripts/pull/1266) Thanks to Zovjsra! + - It seems that the model file loading is faster in the WSL environment etc. + - Available in `sdxl_train.py`, `sdxl_train_network.py`, `sdxl_train_textual_inversion.py`, and `sdxl_train_control_net_lllite.py`. + - Fixed some bugs when using DeepSpeed. 
Related [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247) - SDXL の学習時に Fused optimizer が使えるようになりました。PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) 2kpr 氏に感謝します。 @@ -193,6 +197,10 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - ブロックごとに学習率および dim (rank) を指定することができます。 - 詳細は [LoRA の階層別学習率](./docs/train_network_README-ja.md#階層別学習率) をご覧ください。 +- SDXL でモデルの .safetensors を読み込む際にメモリマッピングを無効化するオプション `--disable_mmap_load_safetensors` が追加されました。PR [#1266](https://github.com/kohya-ss/sd-scripts/pull/1266) Zovjsra 氏に感謝します。 + - WSL 環境等でモデルファイルの読み込みが高速化されるようです。 + - `sdxl_train.py`、`sdxl_train_network.py`、`sdxl_train_textual_inversion.py`、`sdxl_train_control_net_lllite.py` で使用可能です。 + - DeepSpeed 使用時のいくつかのバグを修正しました。関連 [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247) diff --git a/library/sdxl_model_util.py b/library/sdxl_model_util.py index e6fcb1f9c..4fad78a1c 100644 --- a/library/sdxl_model_util.py +++ b/library/sdxl_model_util.py @@ -9,8 +9,10 @@ from library import model_util from library import sdxl_original_unet from .utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) VAE_SCALE_FACTOR = 0.13025 @@ -171,8 +173,8 @@ def load_models_from_sdxl_checkpoint(model_version, ckpt_path, map_location, dty # Load the state dict if model_util.is_safetensors(ckpt_path): checkpoint = None - if(disable_mmap): - state_dict = safetensors.torch.load(open(ckpt_path, 'rb').read()) + if disable_mmap: + state_dict = safetensors.torch.load(open(ckpt_path, "rb").read()) else: try: state_dict = load_file(ckpt_path, device=map_location) diff --git a/library/sdxl_train_util.py b/library/sdxl_train_util.py index 106c5b455..b74bea91a 100644 --- a/library/sdxl_train_util.py +++ b/library/sdxl_train_util.py @@ -5,6 +5,7 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from accelerate import init_empty_weights @@ -13,8 +14,10 @@ from library import model_util, sdxl_model_util, train_util, sdxl_original_unet from library.sdxl_lpw_stable_diffusion import SdxlStableDiffusionLongPromptWeightingPipeline from .utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) TOKENIZER1_PATH = "openai/clip-vit-large-patch14" @@ -44,7 +47,7 @@ def load_target_model(args, accelerator, model_version: str, weight_dtype): weight_dtype, accelerator.device if args.lowram else "cpu", model_dtype, - args.disable_mmap_load_safetensors + args.disable_mmap_load_safetensors, ) # work on low-ram device @@ -336,6 +339,7 @@ def add_sdxl_training_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--disable_mmap_load_safetensors", action="store_true", + help="disable mmap load for safetensors. 
Speed up model loading in WSL environment / safetensorsのmmapロードを無効にする。WSL環境等でモデル読み込みを高速化できる",
    )


From 3701507874c920e09e402980363702a91a67da3d Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Sun, 12 May 2024 20:56:56 +0900
Subject: [PATCH 111/132] raise original error if an error occurs in checking latents

---
 library/train_util.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/library/train_util.py b/library/train_util.py
index d157cdbcd..8a69f0bef 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -2136,9 +2136,8 @@ def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool):
         if npz["latents_flipped"].shape[1:3] != expected_latents_size:
             return False
     except Exception as e:
-        print(npz_path)
-        print(e)
-        return False
+        logger.error(f"Error loading file: {npz_path}")
+        raise e
 
     return True
 

From 39b82f26e5f9df6518a4e32f4b91b4c46cc667fb Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Sun, 12 May 2024 20:58:45 +0900
Subject: [PATCH 112/132] update readme

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index ed91d6d7b..245853415 100644
--- a/README.md
+++ b/README.md
@@ -169,6 +169,8 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
   - It seems that the model file loading is faster in the WSL environment etc.
   - Available in `sdxl_train.py`, `sdxl_train_network.py`, `sdxl_train_textual_inversion.py`, and `sdxl_train_control_net_lllite.py`.
 
+- When there is an error in the cached latents file on disk, the file name is now displayed. PR [#1278](https://github.com/kohya-ss/sd-scripts/pull/1278) Thanks to Cauldrath!
+
 - Fixed some bugs when using DeepSpeed. Related [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247)
 
 - SDXL の学習時に Fused optimizer が使えるようになりました。PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) 2kpr 氏に感謝します。
@@ -201,6 +203,8 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
   - WSL 環境等でモデルファイルの読み込みが高速化されるようです。
   - `sdxl_train.py`、`sdxl_train_network.py`、`sdxl_train_textual_inversion.py`、`sdxl_train_control_net_lllite.py` で使用可能です。
 
+- ディスクにキャッシュされた latents ファイルに何らかのエラーがあったとき、そのファイル名が表示されるようになりました。 PR [#1278](https://github.com/kohya-ss/sd-scripts/pull/1278) Cauldrath 氏に感謝します。
+
 - DeepSpeed 使用時のいくつかのバグを修正しました。関連 [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247)
 

From 16677da0d90ad9094a0301990b831a8dd6c0e957 Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Sun, 12 May 2024 22:15:07 +0900
Subject: [PATCH 113/132] fix create_network_from_weights doesn't work

---
 networks/lora.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/networks/lora.py b/networks/lora.py
index 00d21b0ed..79dc6ec07 100644
--- a/networks/lora.py
+++ b/networks/lora.py
@@ -757,6 +757,9 @@ def get_block_index(lora_name: str, is_sdxl: bool = False) -> int:
 
 # Create network from weights for inference, weights are not loaded here (because can be merged)
 def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weights_sd=None, for_inference=False, **kwargs):
+    # if unet is an instance of SdxlUNet2DConditionModel or subclass, set is_sdxl to True
+    is_sdxl = unet is not None and issubclass(unet.__class__, SdxlUNet2DConditionModel)
+
     if weights_sd is None:
         if os.path.splitext(file)[1] == ".safetensors":
             from safetensors.torch import load_file, safe_open
@@ -792,7 +795,7 @@ def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weigh
     )
 
     # block lr
-    block_lr_weight = parse_block_lr_kwargs(kwargs)
+    
block_lr_weight = parse_block_lr_kwargs(is_sdxl, kwargs) if block_lr_weight is not None: network.set_block_lr_weight(block_lr_weight) From 589c2aa025d277497de32c2ceb8a9e76f4ca4bf2 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 13 May 2024 21:20:37 +0900 Subject: [PATCH 114/132] update README --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 245853415..9d042a41b 100644 --- a/README.md +++ b/README.md @@ -171,6 +171,12 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - When there is an error in the cached latents file on disk, the file name is now displayed. PR [#1278](https://github.com/kohya-ss/sd-scripts/pull/1278) Thanks to Cauldrath! +- Fixed an error that occurs when specifying `--max_dataloader_n_workers` in `tag_images_by_wd14_tagger.py` when Onnx is not used. PR [#1291]( +https://github.com/kohya-ss/sd-scripts/pull/1291) issue [#1290]( +https://github.com/kohya-ss/sd-scripts/pull/1290) Thanks to frodo821! + +- Fixed a bug that `caption_separator` cannot be specified in the subset in the dataset settings .toml file. [#1312](https://github.com/kohya-ss/sd-scripts/pull/1312) and [#1313](https://github.com/kohya-ss/sd-scripts/pull/1312) Thanks to rockerBOO! + - Fixed some bugs when using DeepSpeed. Related [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247) - SDXL の学習時に Fused optimizer が使えるようになりました。PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) 2kpr 氏に感謝します。 @@ -205,6 +211,12 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - ディスクにキャッシュされた latents ファイルに何らかのエラーがあったとき、そのファイル名が表示されるようになりました。 PR [#1278](https://github.com/kohya-ss/sd-scripts/pull/1278) Cauldrath 氏に感謝します。 +- `tag_images_by_wd14_tagger.py` で Onnx 未使用時に `--max_dataloader_n_workers` を指定するとエラーになる不具合が修正されました。 PR [#1291]( +https://github.com/kohya-ss/sd-scripts/pull/1291) issue [#1290]( +https://github.com/kohya-ss/sd-scripts/pull/1290) frodo821 氏に感謝します。 + +- データセット設定の .toml ファイルで、`caption_separator` が subset に指定できない不具合が修正されました。 PR [#1312](https://github.com/kohya-ss/sd-scripts/pull/1312) および [#1313](https://github.com/kohya-ss/sd-scripts/pull/1312) rockerBOO 氏に感謝します。 + - DeepSpeed 使用時のいくつかのバグを修正しました。関連 [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247) From 153764a687d7553866335554d2b35ba89a123297 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 15 May 2024 20:21:49 +0900 Subject: [PATCH 115/132] add prompt option '--f' for filename --- README.md | 3 +++ gen_img.py | 55 +++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 9d042a41b..52d801217 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,8 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) Thanks to frodo821! - Fixed some bugs when using DeepSpeed. Related [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247) +- Added a prompt option `--f` to `gen_imgs.py` to specify the file name when saving. 
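For context on the new option: `gen_img.py` reads per-prompt options out of the prompt line itself, each segment introduced by `--` (the parser hunk in the next patch matches `f <name>` case-insensitively). A usage sketch, assuming the script's existing per-prompt options `--n` (negative prompt) and `--d` (seed); the prompt text is illustrative:

```
masterpiece, 1girl, solo --n lowres, bad anatomy --d 42 --f my_image.png
```

Per the save logic in the hunk below, a filename ending in `.webp` is saved as lossy WebP with `quality=100`, while any other name keeps the existing PNG save path; the PNG metadata is attached in both cases.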
+ - SDXL の学習時に Fused optimizer が使えるようになりました。PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) 2kpr 氏に感謝します。 - optimizer の backward pass に step を統合することで学習時のメモリ使用量を大きく削減します。学習結果は未適用時と同一ですが、メモリが潤沢にある場合は速度は遅くなります。 - `sdxl_train.py` に `--fused_backward_pass` オプションを指定してください。現時点では optimizer は AdaFactor のみ対応しています。また gradient accumulation は使えません。 @@ -219,6 +221,7 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) frodo821 氏に感謝します - DeepSpeed 使用時のいくつかのバグを修正しました。関連 [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247) +- `gen_imgs.py` のプロンプトオプションに、保存時のファイル名を指定する `--f` オプションを追加しました。 ### Apr 7, 2024 / 2024-04-07: v0.8.7 diff --git a/gen_img.py b/gen_img.py index 4fe898716..d0a8f8141 100644 --- a/gen_img.py +++ b/gen_img.py @@ -1435,6 +1435,7 @@ class BatchDataBase(NamedTuple): clip_prompt: str guide_image: Any raw_prompt: str + file_name: Optional[str] class BatchDataExt(NamedTuple): @@ -2316,7 +2317,7 @@ def scale_and_round(x): # このバッチの情報を取り出す ( return_latents, - (step_first, _, _, _, init_image, mask_image, _, guide_image, _), + (step_first, _, _, _, init_image, mask_image, _, guide_image, _, _), ( width, height, @@ -2339,6 +2340,7 @@ def scale_and_round(x): prompts = [] negative_prompts = [] raw_prompts = [] + filenames = [] start_code = torch.zeros((batch_size, *noise_shape), device=device, dtype=dtype) noises = [ torch.zeros((batch_size, *noise_shape), device=device, dtype=dtype) @@ -2371,7 +2373,7 @@ def scale_and_round(x): all_guide_images_are_same = True for i, ( _, - (_, prompt, negative_prompt, seed, init_image, mask_image, clip_prompt, guide_image, raw_prompt), + (_, prompt, negative_prompt, seed, init_image, mask_image, clip_prompt, guide_image, raw_prompt, filename), _, ) in enumerate(batch): prompts.append(prompt) @@ -2379,6 +2381,7 @@ def scale_and_round(x): seeds.append(seed) clip_prompts.append(clip_prompt) raw_prompts.append(raw_prompt) + filenames.append(filename) if init_image is not None: init_images.append(init_image) @@ -2478,8 +2481,8 @@ def scale_and_round(x): # save image highres_prefix = ("0" if highres_1st else "1") if highres_fix else "" ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime()) - for i, (image, prompt, negative_prompts, seed, clip_prompt, raw_prompt) in enumerate( - zip(images, prompts, negative_prompts, seeds, clip_prompts, raw_prompts) + for i, (image, prompt, negative_prompts, seed, clip_prompt, raw_prompt, filename) in enumerate( + zip(images, prompts, negative_prompts, seeds, clip_prompts, raw_prompts, filenames) ): if highres_fix: seed -= 1 # record original seed @@ -2505,17 +2508,23 @@ def scale_and_round(x): metadata.add_text("crop-top", str(crop_top)) metadata.add_text("crop-left", str(crop_left)) - if args.use_original_file_name and init_images is not None: - if type(init_images) is list: - fln = os.path.splitext(os.path.basename(init_images[i % len(init_images)].filename))[0] + ".png" - else: - fln = os.path.splitext(os.path.basename(init_images.filename))[0] + ".png" - elif args.sequential_file_name: - fln = f"im_{highres_prefix}{step_first + i + 1:06d}.png" + if filename is not None: + fln = filename else: - fln = f"im_{ts_str}_{highres_prefix}{i:03d}_{seed}.png" + if args.use_original_file_name and init_images is not None: + if type(init_images) is list: + fln = os.path.splitext(os.path.basename(init_images[i % len(init_images)].filename))[0] + ".png" + else: + fln = os.path.splitext(os.path.basename(init_images.filename))[0] + ".png" + elif args.sequential_file_name: + fln = f"im_{highres_prefix}{step_first + i + 1:06d}.png" + 
else: + fln = f"im_{ts_str}_{highres_prefix}{i:03d}_{seed}.png" - image.save(os.path.join(args.outdir, fln), pnginfo=metadata) + if fln.endswith(".webp"): + image.save(os.path.join(args.outdir, fln), pnginfo=metadata, quality=100) # lossy + else: + image.save(os.path.join(args.outdir, fln), pnginfo=metadata) if not args.no_preview and not highres_1st and args.interactive: try: @@ -2562,6 +2571,7 @@ def scale_and_round(x): # repeat prompt for pi in range(args.images_per_prompt if len(raw_prompts) == 1 else len(raw_prompts)): raw_prompt = raw_prompts[pi] if len(raw_prompts) > 1 else raw_prompts[0] + filename = None if pi == 0 or len(raw_prompts) > 1: # parse prompt: if prompt is not changed, skip parsing @@ -2783,6 +2793,12 @@ def scale_and_round(x): logger.info(f"gradual latent unsharp params: {gl_unsharp_params}") continue + m = re.match(r"f (.+)", parg, re.IGNORECASE) + if m: # filename + filename = m.group(1) + logger.info(f"filename: {filename}") + continue + except ValueError as ex: logger.error(f"Exception in parsing / 解析エラー: {parg}") logger.error(f"{ex}") @@ -2873,7 +2889,16 @@ def scale_and_round(x): b1 = BatchData( False, BatchDataBase( - global_step, prompt, negative_prompt, seed, init_image, mask_image, clip_prompt, guide_image, raw_prompt + global_step, + prompt, + negative_prompt, + seed, + init_image, + mask_image, + clip_prompt, + guide_image, + raw_prompt, + filename, ), BatchDataExt( width, @@ -2916,7 +2941,7 @@ def setup_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() add_logging_arguments(parser) - + parser.add_argument( "--sdxl", action="store_true", help="load Stable Diffusion XL model / Stable Diffusion XLのモデルを読み込む" ) From 146edce6934beee050d8e73458dad794449a0ff4 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 18 May 2024 11:05:04 +0900 Subject: [PATCH 116/132] support Diffusers' based SDXL LoRA key for inference --- networks/lora.py | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/networks/lora.py b/networks/lora.py index 79dc6ec07..9f159f5db 100644 --- a/networks/lora.py +++ b/networks/lora.py @@ -755,6 +755,52 @@ def get_block_index(lora_name: str, is_sdxl: bool = False) -> int: return block_idx +def convert_diffusers_to_sai_if_needed(weights_sd): + # only supports U-Net LoRA modules + + found_up_down_blocks = False + for k in list(weights_sd.keys()): + if "down_blocks" in k: + found_up_down_blocks = True + break + if "up_blocks" in k: + found_up_down_blocks = True + break + if not found_up_down_blocks: + return + + from library.sdxl_model_util import make_unet_conversion_map + + unet_conversion_map = make_unet_conversion_map() + unet_conversion_map = {hf.replace(".", "_")[:-1]: sd.replace(".", "_")[:-1] for sd, hf in unet_conversion_map} + + # # add extra conversion + # unet_conversion_map["up_blocks_1_upsamplers_0"] = "lora_unet_output_blocks_2_2_conv" + + logger.info(f"Converting LoRA keys from Diffusers to SAI") + lora_unet_prefix = "lora_unet_" + for k in list(weights_sd.keys()): + if not k.startswith(lora_unet_prefix): + continue + + unet_module_name = k[len(lora_unet_prefix) :].split(".")[0] + + # search for conversion: this is slow because the algorithm is O(n^2), but the number of keys is small + for hf_module_name, sd_module_name in unet_conversion_map.items(): + if hf_module_name in unet_module_name: + new_key = ( + lora_unet_prefix + + unet_module_name.replace(hf_module_name, sd_module_name) + + k[len(lora_unet_prefix) + len(unet_module_name) :] + ) + weights_sd[new_key] = 
weights_sd.pop(k) + found = True + break + + if not found: + logger.warning(f"Key {k} is not found in unet_conversion_map") + + # Create network from weights for inference, weights are not loaded here (because can be merged) def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weights_sd=None, for_inference=False, **kwargs): # if unet is an instance of SdxlUNet2DConditionModel or subclass, set is_sdxl to True @@ -768,6 +814,9 @@ def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weigh else: weights_sd = torch.load(file, map_location="cpu") + # if keys are Diffusers based, convert to SAI based + convert_diffusers_to_sai_if_needed(weights_sd) + # get dim/alpha mapping modules_dim = {} modules_alpha = {} From 2f19175dfeb98e5ad93a633c79fa846d67210844 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 19 May 2024 15:38:37 +0900 Subject: [PATCH 117/132] update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 52d801217..b9852e0ad 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) Thanks to frodo821! - Fixed some bugs when using DeepSpeed. Related [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247) -- Added a prompt option `--f` to `gen_imgs.py` to specify the file name when saving. +- Added a prompt option `--f` to `gen_imgs.py` to specify the file name when saving. Also, Diffusers-based keys for LoRA weights are now supported. - SDXL の学習時に Fused optimizer が使えるようになりました。PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) 2kpr 氏に感謝します。 - optimizer の backward pass に step を統合することで学習時のメモリ使用量を大きく削減します。学習結果は未適用時と同一ですが、メモリが潤沢にある場合は速度は遅くなります。 @@ -221,7 +221,7 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) frodo821 氏に感謝します - DeepSpeed 使用時のいくつかのバグを修正しました。関連 [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247) -- `gen_imgs.py` のプロンプトオプションに、保存時のファイル名を指定する `--f` オプションを追加しました。 +- `gen_imgs.py` のプロンプトオプションに、保存時のファイル名を指定する `--f` オプションを追加しました。また同スクリプトで Diffusers ベースのキーを持つ LoRA の重みに対応しました。 ### Apr 7, 2024 / 2024-04-07: v0.8.7 From e3ddd1fbbe4e00f49649f5aabd470b9dccf3019d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 19 May 2024 16:26:10 +0900 Subject: [PATCH 118/132] update README and format code --- README.md | 4 ++++ sdxl_train_control_net_lllite.py | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b9852e0ad..5d035eb6f 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,8 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) Thanks to frodo821! - Fixed a bug that `caption_separator` cannot be specified in the subset in the dataset settings .toml file. [#1312](https://github.com/kohya-ss/sd-scripts/pull/1312) and [#1313](https://github.com/kohya-ss/sd-scripts/pull/1312) Thanks to rockerBOO! +- Fixed a potential bug in ControlNet-LLLite training. PR [#1322](https://github.com/kohya-ss/sd-scripts/pull/1322) Thanks to aria1th! + - Fixed some bugs when using DeepSpeed. Related [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247) - Added a prompt option `--f` to `gen_imgs.py` to specify the file name when saving. Also, Diffusers-based keys for LoRA weights are now supported. 
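To make the key conversion in PATCH 116 concrete, here is a self-contained sketch of the substring replacement it performs. The two mapping entries are illustrative stand-ins for the real `make_unet_conversion_map()` output, and the tensor values are dummies:

```python
# Toy version of convert_diffusers_to_sai_if_needed: rewrite the module part of
# each "lora_unet_..." key from Diffusers (hf) naming to SAI (sd) naming.
weights_sd = {
    "lora_unet_down_blocks_1_attentions_0_proj_in.lora_down.weight": 0.0,
    "lora_unet_mid_block_attentions_0_proj_out.lora_up.weight": 1.0,
}
unet_conversion_map = {  # hf module name -> sd module name (illustrative entries)
    "down_blocks_1_attentions_0": "input_blocks_4_1",
    "mid_block_attentions_0": "middle_block_1",
}

prefix = "lora_unet_"
for k in list(weights_sd.keys()):
    module_name = k[len(prefix):].split(".")[0]
    found = False  # reset per key, so a key with no mapping is reported below
    for hf_name, sd_name in unet_conversion_map.items():
        if hf_name in module_name:
            suffix = k[len(prefix) + len(module_name):]
            weights_sd[prefix + module_name.replace(hf_name, sd_name) + suffix] = weights_sd.pop(k)
            found = True
            break
    if not found:
        print(f"no conversion found for {k}")

for k in sorted(weights_sd):
    print(k)
```

This prints `lora_unet_input_blocks_4_1_proj_in.lora_down.weight` and `lora_unet_middle_block_1_proj_out.lora_up.weight`: only the module part of each key is rewritten, while the `.lora_down`/`.lora_up` suffixes pass through untouched.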
@@ -219,6 +221,8 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) frodo821 氏に感謝します - データセット設定の .toml ファイルで、`caption_separator` が subset に指定できない不具合が修正されました。 PR [#1312](https://github.com/kohya-ss/sd-scripts/pull/1312) および [#1313](https://github.com/kohya-ss/sd-scripts/pull/1312) rockerBOO 氏に感謝します。 +- ControlNet-LLLite 学習時の潜在バグが修正されました。 PR [#1322](https://github.com/kohya-ss/sd-scripts/pull/1322) aria1th 氏に感謝します。 + - DeepSpeed 使用時のいくつかのバグを修正しました。関連 [#1247](https://github.com/kohya-ss/sd-scripts/pull/1247) - `gen_imgs.py` のプロンプトオプションに、保存時のファイル名を指定する `--f` オプションを追加しました。また同スクリプトで Diffusers ベースのキーを持つ LoRA の重みに対応しました。 diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 6ad6e763c..09b6d73be 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -15,6 +15,7 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from torch.nn.parallel import DistributedDataParallel as DDP @@ -439,7 +440,9 @@ def remove_model(old_ckpt_name): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) + noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps( + args, noise_scheduler, latents + ) noisy_latents = noisy_latents.to(weight_dtype) # TODO check why noisy_latents is not weight_dtype @@ -458,7 +461,9 @@ def remove_model(old_ckpt_name): else: target = noise - loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) + loss = train_util.conditional_loss( + noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + ) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight From c68baae48033fe9794860518fe052dbf8def905e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 19 May 2024 17:21:04 +0900 Subject: [PATCH 119/132] add `--log_config` option to enable/disable output training config --- README.md | 6 ++++++ fine_tune.py | 20 +++++++++++++++----- library/train_util.py | 16 +++++++++++++--- sdxl_train.py | 2 +- sdxl_train_control_net_lllite.py | 2 +- sdxl_train_control_net_lllite_old.py | 2 +- train_controlnet.py | 2 +- train_db.py | 2 +- train_network.py | 2 +- train_textual_inversion.py | 2 +- train_textual_inversion_XTI.py | 2 +- 11 files changed, 42 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 5d035eb6f..cd7744598 100644 --- a/README.md +++ b/README.md @@ -165,6 +165,9 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - Specify the learning rate and dim (rank) for each block. - See [Block-wise learning rates in LoRA](./docs/train_network_README-ja.md#階層別学習率) for details (Japanese only). +- Training scripts can now output training settings to wandb or Tensor Board logs. Specify the `--log_config` option. PR [#1285](https://github.com/kohya-ss/sd-scripts/pull/1285) Thanks to ccharest93, plucked, rockerBOO, and VelocityRa! + - Some settings, such as API keys and directory specifications, are not output due to security issues. + - An option `--disable_mmap_load_safetensors` is added to disable memory mapping when loading the model's .safetensors in SDXL. PR [#1266](https://github.com/kohya-ss/sd-scripts/pull/1266) Thanks to Zovjsra! 
- It seems that the model file loading is faster in the WSL environment etc. - Available in `sdxl_train.py`, `sdxl_train_network.py`, `sdxl_train_textual_inversion.py`, and `sdxl_train_control_net_lllite.py`. @@ -209,6 +212,9 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) Thanks to frodo821! - ブロックごとに学習率および dim (rank) を指定することができます。 - 詳細は [LoRA の階層別学習率](./docs/train_network_README-ja.md#階層別学習率) をご覧ください。 +- 各学習スクリプトで学習設定を wandb や Tensor Board などのログに出力できるようになりました。`--log_config` オプションを指定してください。PR [#1285](https://github.com/kohya-ss/sd-scripts/pull/1285) ccharest93 氏、plucked 氏、rockerBOO 氏および VelocityRa 氏に感謝します。 + - API キーや各種ディレクトリ指定など、一部の設定はセキュリティ上の問題があるため出力されません。 + - SDXL でモデルの .safetensors を読み込む際にメモリマッピングを無効化するオプション `--disable_mmap_load_safetensors` が追加されました。PR [#1266](https://github.com/kohya-ss/sd-scripts/pull/1266) Zovjsra 氏に感謝します。 - WSL 環境等でモデルファイルの読み込みが高速化されるようです。 - `sdxl_train.py`、`sdxl_train_network.py`、`sdxl_train_textual_inversion.py`、`sdxl_train_control_net_lllite.py` で使用可能です。 diff --git a/fine_tune.py b/fine_tune.py index 77a1a4f30..d865cd2de 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -310,7 +310,11 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): init_kwargs["wandb"] = {"name": args.wandb_run_name} if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) - accelerator.init_trackers("finetuning" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs) + accelerator.init_trackers( + "finetuning" if args.log_tracker_name is None else args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, + ) # For --sample_at_first train_util.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) @@ -354,7 +358,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # Sample noise, sample a random timestep for each image, and add noise to the latents, # with noise offset and/or multires noise if specified - noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents) + noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps( + args, noise_scheduler, latents + ) # Predict the noise residual with accelerator.autocast(): @@ -368,7 +374,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if args.min_snr_gamma or args.scale_v_pred_loss_like_noise_pred or args.debiased_estimation_loss: # do not mean over batch dimension for snr weight or scale v-pred loss - loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) + loss = train_util.conditional_loss( + noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + ) loss = loss.mean([1, 2, 3]) if args.min_snr_gamma: @@ -380,7 +388,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): loss = loss.mean() # mean over batch dimension else: - loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c) + loss = train_util.conditional_loss( + noise_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c + ) accelerator.backward(loss) if accelerator.sync_gradients and args.max_grad_norm != 0.0: @@ -471,7 +481,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): accelerator.end_training() - if 
is_main_process and (args.save_state or args.save_state_on_train_end): + if is_main_process and (args.save_state or args.save_state_on_train_end): train_util.save_state_on_train_end(args, accelerator) del accelerator # この後メモリを使うのでこれは消す diff --git a/library/train_util.py b/library/train_util.py index 84764263e..410471470 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3180,6 +3180,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: default=None, help="specify WandB API key to log in before starting training (optional). / WandB APIキーを指定して学習開始前にログインする(オプション)", ) + parser.add_argument("--log_config", action="store_true", help="log training configuration / 学習設定をログに出力する") parser.add_argument( "--noise_offset", @@ -3388,7 +3389,15 @@ def add_masked_loss_arguments(parser: argparse.ArgumentParser): help="apply mask for calculating loss. conditioning_data_dir is required for dataset. / 損失計算時にマスクを適用する。datasetにはconditioning_data_dirが必要", ) -def filter_sensitive_args(args: argparse.Namespace): + +def get_sanitized_config_or_none(args: argparse.Namespace): + # if `--log_config` is enabled, return args for logging. if not, return None. + # when `--log_config is enabled, filter out sensitive values from args + # if wandb is not enabled, the log is not exposed to the public, but it is fine to filter out sensitive values to be safe + + if not args.log_config: + return None + sensitive_args = ["wandb_api_key", "huggingface_token"] sensitive_path_args = [ "pretrained_model_name_or_path", @@ -3402,9 +3411,9 @@ def filter_sensitive_args(args: argparse.Namespace): ] filtered_args = {} for k, v in vars(args).items(): - # filter out sensitive values + # filter out sensitive values and convert to string if necessary if k not in sensitive_args + sensitive_path_args: - #Accelerate values need to have type `bool`,`str`, `float`, `int`, or `None`. + # Accelerate values need to have type `bool`,`str`, `float`, `int`, or `None`. 
if v is None or isinstance(v, bool) or isinstance(v, str) or isinstance(v, float) or isinstance(v, int): filtered_args[k] = v # accelerate does not support lists @@ -3416,6 +3425,7 @@ def filter_sensitive_args(args: argparse.Namespace): return filtered_args + # verify command line args for training def verify_command_line_training_args(args: argparse.Namespace): # if wandb is enabled, the command line is exposed to the public diff --git a/sdxl_train.py b/sdxl_train.py index 4c4e38721..11f9892a3 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -589,7 +589,7 @@ def optimizer_hook(parameter: torch.Tensor): init_kwargs["wandb"] = {"name": args.wandb_run_name} if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) - accelerator.init_trackers("finetuning" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs) + accelerator.init_trackers("finetuning" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs) # For --sample_at_first sdxl_train_util.sample_images( diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index b141965fa..301310901 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -354,7 +354,7 @@ def train(args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "lllite_control_net_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs + "lllite_control_net_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs ) loss_recorder = train_util.LossRecorder() diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py index 9490cf6f2..292a0463a 100644 --- a/sdxl_train_control_net_lllite_old.py +++ b/sdxl_train_control_net_lllite_old.py @@ -324,7 +324,7 @@ def train(args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "lllite_control_net_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs + "lllite_control_net_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs ) loss_recorder = train_util.LossRecorder() diff --git a/train_controlnet.py b/train_controlnet.py index 793f79c7d..9994dd99c 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -344,7 +344,7 @@ def train(args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "controlnet_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs + "controlnet_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs ) loss_recorder = train_util.LossRecorder() diff --git a/train_db.py b/train_db.py index 4f9018293..a5408cd3d 100644 --- a/train_db.py +++ b/train_db.py @@ -290,7 +290,7 @@ def train(args): init_kwargs["wandb"] = {"name": args.wandb_run_name} if args.log_tracker_config is not None: init_kwargs = 
toml.load(args.log_tracker_config) - accelerator.init_trackers("dreambooth" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs) + accelerator.init_trackers("dreambooth" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs) # For --sample_at_first train_util.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) diff --git a/train_network.py b/train_network.py index 401a1c70e..38e4888e8 100644 --- a/train_network.py +++ b/train_network.py @@ -774,7 +774,7 @@ def load_model_hook(models, input_dir): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "network_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs + "network_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs ) loss_recorder = train_util.LossRecorder() diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 56a387391..184607d1d 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -510,7 +510,7 @@ def train(self, args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs + "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs ) # function for saving/removing diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 691785239..8eed00fa1 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -407,7 +407,7 @@ def train(args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.filter_sensitive_args(args), init_kwargs=init_kwargs + "textual_inversion" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs ) # function for saving/removing From e4d9e3c843f5d9bfbfe56bd44c8f6a04d370201e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 19 May 2024 17:46:07 +0900 Subject: [PATCH 120/132] remove dependency for omegaconf #ref 1284 --- README.md | 4 ++++ requirements.txt | 1 - train_controlnet.py | 38 +++++++++++++++++++++++++++++++------- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index cd7744598..04769a4cf 100644 --- a/README.md +++ b/README.md @@ -167,6 +167,8 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - Training scripts can now output training settings to wandb or Tensor Board logs. Specify the `--log_config` option. PR [#1285](https://github.com/kohya-ss/sd-scripts/pull/1285) Thanks to ccharest93, plucked, rockerBOO, and VelocityRa! - Some settings, such as API keys and directory specifications, are not output due to security issues. + +- The ControlNet training script `train_controlnet.py` for SD1.5/2.x was not working, but it has been fixed. 
PR [#1284](https://github.com/kohya-ss/sd-scripts/pull/1284) Thanks to sdbds! - An option `--disable_mmap_load_safetensors` is added to disable memory mapping when loading the model's .safetensors in SDXL. PR [#1266](https://github.com/kohya-ss/sd-scripts/pull/1266) Thanks to Zovjsra! - It seems that the model file loading is faster in the WSL environment etc. @@ -215,6 +217,8 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) Thanks to frodo821! - 各学習スクリプトで学習設定を wandb や Tensor Board などのログに出力できるようになりました。`--log_config` オプションを指定してください。PR [#1285](https://github.com/kohya-ss/sd-scripts/pull/1285) ccharest93 氏、plucked 氏、rockerBOO 氏および VelocityRa 氏に感謝します。 - API キーや各種ディレクトリ指定など、一部の設定はセキュリティ上の問題があるため出力されません。 +- SD1.5/2.x 用の ControlNet 学習スクリプト `train_controlnet.py` が動作しなくなっていたのが修正されました。PR [#1284](https://github.com/kohya-ss/sd-scripts/pull/1284) sdbds 氏に感謝します。 + - SDXL でモデルの .safetensors を読み込む際にメモリマッピングを無効化するオプション `--disable_mmap_load_safetensors` が追加されました。PR [#1266](https://github.com/kohya-ss/sd-scripts/pull/1266) Zovjsra 氏に感謝します。 - WSL 環境等でモデルファイルの読み込みが高速化されるようです。 - `sdxl_train.py`、`sdxl_train_network.py`、`sdxl_train_textual_inversion.py`、`sdxl_train_control_net_lllite.py` で使用可能です。 diff --git a/requirements.txt b/requirements.txt index 9495dab2a..e99775b8a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,6 @@ easygui==0.98.3 toml==0.10.2 voluptuous==0.13.1 huggingface-hub==0.20.1 -omegaconf==2.3.0 # for Image utils imagesize==1.4.1 # for BLIP captioning diff --git a/train_controlnet.py b/train_controlnet.py index 3a1fa9de6..c9ac6c5a8 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -5,7 +5,8 @@ import random import time from multiprocessing import Value -from omegaconf import OmegaConf + +# from omegaconf import OmegaConf import toml from tqdm import tqdm @@ -13,6 +14,7 @@ import torch from library import deepspeed_utils from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from torch.nn.parallel import DistributedDataParallel as DDP @@ -197,7 +199,23 @@ def train(args): "resnet_time_scale_shift": "default", "projection_class_embeddings_input_dim": None, } - unet.config = OmegaConf.create(unet.config) + # unet.config = OmegaConf.create(unet.config) + + # make unet.config iterable and accessible by attribute + class CustomConfig: + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + + def __getattr__(self, name): + if name in self.__dict__: + return self.__dict__[name] + else: + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") + + def __contains__(self, name): + return name in self.__dict__ + + unet.config = CustomConfig(**unet.config) controlnet = ControlNetModel.from_unet(unet) @@ -230,7 +248,7 @@ def train(args): ) vae.to("cpu") clean_memory_on_device(accelerator.device) - + accelerator.wait_for_everyone() if args.gradient_checkpointing: @@ -239,7 +257,7 @@ def train(args): # 学習に必要なクラスを準備する accelerator.print("prepare optimizer, data loader etc.") - trainable_params = controlnet.parameters() + trainable_params = list(controlnet.parameters()) _, _, optimizer = train_util.get_optimizer(args, trainable_params) @@ -348,7 +366,9 @@ def train(args): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "controlnet_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs + "controlnet_train" if args.log_tracker_name is None else 
args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, ) loss_recorder = train_util.LossRecorder() @@ -424,7 +444,9 @@ def remove_model(old_ckpt_name): ) # Sample a random timestep for each image - timesteps, huber_c = train_util.get_timesteps_and_huber_c(args, 0, noise_scheduler.config.num_train_timesteps, noise_scheduler, b_size, latents.device) + timesteps, huber_c = train_util.get_timesteps_and_huber_c( + args, 0, noise_scheduler.config.num_train_timesteps, noise_scheduler, b_size, latents.device + ) # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) @@ -456,7 +478,9 @@ def remove_model(old_ckpt_name): else: target = noise - loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) + loss = train_util.conditional_loss( + noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c + ) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight From 4c798129b04955caad1c48405de168ff63a3809c Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 19 May 2024 19:00:32 +0900 Subject: [PATCH 121/132] update README --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 04769a4cf..d0f2d65b2 100644 --- a/README.md +++ b/README.md @@ -165,11 +165,14 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - Specify the learning rate and dim (rank) for each block. - See [Block-wise learning rates in LoRA](./docs/train_network_README-ja.md#階層別学習率) for details (Japanese only). +- Negative learning rates can now be specified during SDXL model training. PR [#1277](https://github.com/kohya-ss/sd-scripts/pull/1277) Thanks to Cauldrath! + - The model is trained to move away from the training images, so the model is easily collapsed. Use with caution. A value close to 0 is recommended. + - Training scripts can now output training settings to wandb or Tensor Board logs. Specify the `--log_config` option. PR [#1285](https://github.com/kohya-ss/sd-scripts/pull/1285) Thanks to ccharest93, plucked, rockerBOO, and VelocityRa! - Some settings, such as API keys and directory specifications, are not output due to security issues. - The ControlNet training script `train_controlnet.py` for SD1.5/2.x was not working, but it has been fixed. PR [#1284](https://github.com/kohya-ss/sd-scripts/pull/1284) Thanks to sdbds! - + - An option `--disable_mmap_load_safetensors` is added to disable memory mapping when loading the model's .safetensors in SDXL. PR [#1266](https://github.com/kohya-ss/sd-scripts/pull/1266) Thanks to Zovjsra! - It seems that the model file loading is faster in the WSL environment etc. - Available in `sdxl_train.py`, `sdxl_train_network.py`, `sdxl_train_textual_inversion.py`, and `sdxl_train_control_net_lllite.py`. @@ -214,6 +217,9 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) Thanks to frodo821! 
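Returning to the `CustomConfig` shim that PATCH 120 introduces in `train_controlnet.py` above: it only has to support plain attribute access and `in` membership tests over the config dict, which is presumably what Diffusers' `ControlNetModel.from_unet` exercises on `unet.config`. A minimal demo, condensed from the hunk above with illustrative config keys:

```python
class CustomConfig:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def __getattr__(self, name):
        # only invoked when normal attribute lookup fails
        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")

    def __contains__(self, name):
        return name in self.__dict__


config = CustomConfig(sample_size=64, num_class_embeds=None)
print(config.sample_size)                        # 64, via ordinary attribute lookup
print("num_class_embeds" in config)              # True, via __contains__
print(getattr(config, "missing", "<fallback>"))  # "<fallback>", AttributeError is caught
```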
- ブロックごとに学習率および dim (rank) を指定することができます。 - 詳細は [LoRA の階層別学習率](./docs/train_network_README-ja.md#階層別学習率) をご覧ください。 +- `sdxl_train.py` での SDXL モデル学習時に負の学習率が指定できるようになりました。PR [#1277](https://github.com/kohya-ss/sd-scripts/pull/1277) Cauldrath 氏に感謝します。 + - 学習画像から離れるように学習するため、モデルは容易に崩壊します。注意して使用してください。0 に近い値を推奨します。 + - 各学習スクリプトで学習設定を wandb や Tensor Board などのログに出力できるようになりました。`--log_config` オプションを指定してください。PR [#1285](https://github.com/kohya-ss/sd-scripts/pull/1285) ccharest93 氏、plucked 氏、rockerBOO 氏および VelocityRa 氏に感謝します。 - API キーや各種ディレクトリ指定など、一部の設定はセキュリティ上の問題があるため出力されません。 From febc5c59fad74dfcead9064033171a9c674e4870 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 19 May 2024 19:03:43 +0900 Subject: [PATCH 122/132] update README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index d0f2d65b2..838e4022c 100644 --- a/README.md +++ b/README.md @@ -167,6 +167,7 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - Negative learning rates can now be specified during SDXL model training. PR [#1277](https://github.com/kohya-ss/sd-scripts/pull/1277) Thanks to Cauldrath! - The model is trained to move away from the training images, so the model is easily collapsed. Use with caution. A value close to 0 is recommended. + - When specifying from the command line, use `=` like `--learning_rate=-1e-7`. - Training scripts can now output training settings to wandb or Tensor Board logs. Specify the `--log_config` option. PR [#1285](https://github.com/kohya-ss/sd-scripts/pull/1285) Thanks to ccharest93, plucked, rockerBOO, and VelocityRa! - Some settings, such as API keys and directory specifications, are not output due to security issues. @@ -219,6 +220,7 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) Thanks to frodo821! 
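The `=` requirement that this patch documents comes from `argparse`, not from the training scripts: a space-separated token starting with `-` is treated as an option string unless it matches argparse's simple negative-number pattern, and scientific notation like `-1e-7` does not. A minimal reproduction, assuming the argparse behavior of the Python versions current when this patch landed:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--learning_rate", type=float)

# works: the value is glued to the flag with "="
print(parser.parse_args(["--learning_rate=-1e-7"]))  # Namespace(learning_rate=-1e-07)

# fails: "-1e-7" is mistaken for another option
try:
    parser.parse_args(["--learning_rate", "-1e-7"])
except SystemExit:
    pass  # argparse reports: expected one argument
```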
- `sdxl_train.py` での SDXL モデル学習時に負の学習率が指定できるようになりました。PR [#1277](https://github.com/kohya-ss/sd-scripts/pull/1277) Cauldrath 氏に感謝します。
   - 学習画像から離れるように学習するため、モデルは容易に崩壊します。注意して使用してください。0 に近い値を推奨します。
+  - コマンドラインから指定する場合、`--learning_rate=-1e-7` のように`=` を使ってください。
 
 - 各学習スクリプトで学習設定を wandb や Tensor Board などのログに出力できるようになりました。`--log_config` オプションを指定してください。PR [#1285](https://github.com/kohya-ss/sd-scripts/pull/1285) ccharest93 氏、plucked 氏、rockerBOO 氏および VelocityRa 氏に感謝します。
   - API キーや各種ディレクトリ指定など、一部の設定はセキュリティ上の問題があるため出力されません。
 

From db6752901fc204686e460255797b188cb28611a5 Mon Sep 17 00:00:00 2001
From: u-haru <40634644+u-haru@users.noreply.github.com>
Date: Sun, 19 May 2024 19:07:25 +0900
Subject: [PATCH 123/132] Add an option to use the image's alpha channel as
 the mask for the loss (#1223)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add alpha_mask parameter and apply masked loss
* Fix type hint in trim_and_resize_if_required function
* Refactor code to use keyword arguments in train_util.py
* Fix alpha mask flipping logic
* Fix alpha mask initialization
* Fix alpha_mask transformation
* Cache alpha_mask
* Update alpha_masks to be on CPU
* Set flipped_alpha_masks to None if option disabled
* Check if alpha_mask is None
* Set alpha_mask to None if option disabled
* Add description of alpha_mask option to docs
---
 docs/train_network_README-ja.md   |   2 +
 docs/train_network_README-zh.md   |   2 +
 library/config_util.py            |   2 +
 library/custom_train_functions.py |   5 +-
 library/train_util.py             | 203 ++++++++++++------------
 sdxl_train.py                     |   4 +-
 train_db.py                       |   4 +-
 train_network.py                  |   4 +-
 train_textual_inversion.py        |   4 +-
 train_textual_inversion_XTI.py    |   4 +-
 10 files changed, 105 insertions(+), 129 deletions(-)

diff --git a/docs/train_network_README-ja.md b/docs/train_network_README-ja.md
index 46085117c..55c80c4b0 100644
--- a/docs/train_network_README-ja.md
+++ b/docs/train_network_README-ja.md
@@ -102,6 +102,8 @@ accelerate launch --num_cpu_threads_per_process 1 train_network.py
 * Text Encoderに関連するLoRAモジュールに、通常の学習率(--learning_rateオプションで指定)とは異なる学習率を使う時に指定します。Text Encoderのほうを若干低めの学習率(5e-5など)にしたほうが良い、という話もあるようです。
 * `--network_args`
   * 複数の引数を指定できます。後述します。
+* `--alpha_mask`
+  * 画像のアルファ値をマスクとして使用します。透過画像を学習する際に使用します。[PR #1223](https://github.com/kohya-ss/sd-scripts/pull/1223)
 
 `--network_train_unet_only` と `--network_train_text_encoder_only` の両方とも未指定時(デフォルト)はText EncoderとU-Netの両方のLoRAモジュールを有効にします。
 
diff --git a/docs/train_network_README-zh.md b/docs/train_network_README-zh.md
index ed7a0c4ef..830014f72 100644
--- a/docs/train_network_README-zh.md
+++ b/docs/train_network_README-zh.md
@@ -101,6 +101,8 @@ LoRA的模型将会被保存在通过`--output_dir`选项指定的文件夹中
 * 当在Text Encoder相关的LoRA模块中使用与常规学习率(由`--learning_rate`选项指定)不同的学习率时,应指定此选项。可能最好将Text Encoder的学习率稍微降低(例如5e-5)。
 * `--network_args`
   * 可以指定多个参数。将在下面详细说明。
+* `--alpha_mask`
+  * 使用图像的 Alpha 值作为遮罩。这在学习透明图像时使用。[PR #1223](https://github.com/kohya-ss/sd-scripts/pull/1223)
 
 当未指定`--network_train_unet_only`和`--network_train_text_encoder_only`时(默认情况),将启用Text Encoder和U-Net的两个LoRA模块。
 
diff --git a/library/config_util.py b/library/config_util.py
index 59f5f86d2..82baab83e 100644
--- a/library/config_util.py
+++ b/library/config_util.py
@@ -78,6 +78,7 @@ class 
BaseSubsetParams: caption_tag_dropout_rate: float = 0.0 token_warmup_min: int = 1 token_warmup_step: float = 0 + alpha_mask: bool = False @dataclass @@ -538,6 +539,7 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu random_crop: {subset.random_crop} token_warmup_min: {subset.token_warmup_min}, token_warmup_step: {subset.token_warmup_step}, + alpha_mask: {subset.alpha_mask}, """ ), " ", diff --git a/library/custom_train_functions.py b/library/custom_train_functions.py index 406e0e36e..fad127405 100644 --- a/library/custom_train_functions.py +++ b/library/custom_train_functions.py @@ -479,9 +479,10 @@ def apply_noise_offset(latents, noise, noise_offset, adaptive_noise_scale): return noise -def apply_masked_loss(loss, batch): +def apply_masked_loss(loss, mask_image): # mask image is -1 to 1. we need to convert it to 0 to 1 - mask_image = batch["conditioning_images"].to(dtype=loss.dtype)[:, 0].unsqueeze(1) # use R channel + # mask_image = batch["conditioning_images"].to(dtype=loss.dtype)[:, 0].unsqueeze(1) # use R channel + mask_image = mask_image.to(dtype=loss.dtype) # resize to the same size as the loss mask_image = torch.nn.functional.interpolate(mask_image, size=loss.shape[2:], mode="area") diff --git a/library/train_util.py b/library/train_util.py index 410471470..20f8055dc 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -159,6 +159,9 @@ def __init__(self, image_key: str, num_repeats: int, caption: str, is_reg: bool, self.text_encoder_outputs1: Optional[torch.Tensor] = None self.text_encoder_outputs2: Optional[torch.Tensor] = None self.text_encoder_pool2: Optional[torch.Tensor] = None + self.alpha_mask: Optional[torch.Tensor] = None + self.alpha_mask_flipped: Optional[torch.Tensor] = None + self.use_alpha_mask: bool = False class BucketManager: @@ -379,6 +382,7 @@ def __init__( caption_suffix: Optional[str], token_warmup_min: int, token_warmup_step: Union[float, int], + alpha_mask: bool, ) -> None: self.image_dir = image_dir self.num_repeats = num_repeats @@ -403,6 +407,7 @@ def __init__( self.img_count = 0 + self.alpha_mask = alpha_mask class DreamBoothSubset(BaseSubset): def __init__( @@ -412,47 +417,13 @@ def __init__( class_tokens: Optional[str], caption_extension: str, cache_info: bool, - num_repeats, - shuffle_caption, - caption_separator: str, - keep_tokens, - keep_tokens_separator, - secondary_separator, - enable_wildcard, - color_aug, - flip_aug, - face_crop_aug_range, - random_crop, - caption_dropout_rate, - caption_dropout_every_n_epochs, - caption_tag_dropout_rate, - caption_prefix, - caption_suffix, - token_warmup_min, - token_warmup_step, + **kwargs, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" super().__init__( image_dir, - num_repeats, - shuffle_caption, - caption_separator, - keep_tokens, - keep_tokens_separator, - secondary_separator, - enable_wildcard, - color_aug, - flip_aug, - face_crop_aug_range, - random_crop, - caption_dropout_rate, - caption_dropout_every_n_epochs, - caption_tag_dropout_rate, - caption_prefix, - caption_suffix, - token_warmup_min, - token_warmup_step, + **kwargs, ) self.is_reg = is_reg @@ -473,47 +444,13 @@ def __init__( self, image_dir, metadata_file: str, - num_repeats, - shuffle_caption, - caption_separator, - keep_tokens, - keep_tokens_separator, - secondary_separator, - enable_wildcard, - color_aug, - flip_aug, - face_crop_aug_range, - random_crop, - caption_dropout_rate, - caption_dropout_every_n_epochs, - caption_tag_dropout_rate, - 
caption_prefix, - caption_suffix, - token_warmup_min, - token_warmup_step, + **kwargs, ) -> None: assert metadata_file is not None, "metadata_file must be specified / metadata_fileは指定が必須です" super().__init__( image_dir, - num_repeats, - shuffle_caption, - caption_separator, - keep_tokens, - keep_tokens_separator, - secondary_separator, - enable_wildcard, - color_aug, - flip_aug, - face_crop_aug_range, - random_crop, - caption_dropout_rate, - caption_dropout_every_n_epochs, - caption_tag_dropout_rate, - caption_prefix, - caption_suffix, - token_warmup_min, - token_warmup_step, + **kwargs, ) self.metadata_file = metadata_file @@ -531,47 +468,13 @@ def __init__( conditioning_data_dir: str, caption_extension: str, cache_info: bool, - num_repeats, - shuffle_caption, - caption_separator, - keep_tokens, - keep_tokens_separator, - secondary_separator, - enable_wildcard, - color_aug, - flip_aug, - face_crop_aug_range, - random_crop, - caption_dropout_rate, - caption_dropout_every_n_epochs, - caption_tag_dropout_rate, - caption_prefix, - caption_suffix, - token_warmup_min, - token_warmup_step, + **kwargs, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" super().__init__( image_dir, - num_repeats, - shuffle_caption, - caption_separator, - keep_tokens, - keep_tokens_separator, - secondary_separator, - enable_wildcard, - color_aug, - flip_aug, - face_crop_aug_range, - random_crop, - caption_dropout_rate, - caption_dropout_every_n_epochs, - caption_tag_dropout_rate, - caption_prefix, - caption_suffix, - token_warmup_min, - token_warmup_step, + **kwargs, ) self.conditioning_data_dir = conditioning_data_dir @@ -985,6 +888,8 @@ def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_proc for info in tqdm(image_infos): subset = self.image_to_subset[info.image_key] + info.use_alpha_mask = subset.alpha_mask + if info.latents_npz is not None: # fine tuning dataset continue @@ -1088,8 +993,8 @@ def cache_text_encoder_outputs( def get_image_size(self, image_path): return imagesize.get(image_path) - def load_image_with_face_info(self, subset: BaseSubset, image_path: str): - img = load_image(image_path) + def load_image_with_face_info(self, subset: BaseSubset, image_path: str, alpha_mask=False): + img = load_image(image_path, alpha_mask) face_cx = face_cy = face_w = face_h = 0 if subset.face_crop_aug_range is not None: @@ -1166,6 +1071,7 @@ def __getitem__(self, index): input_ids_list = [] input_ids2_list = [] latents_list = [] + alpha_mask_list = [] images = [] original_sizes_hw = [] crop_top_lefts = [] @@ -1190,21 +1096,27 @@ def __getitem__(self, index): crop_ltrb = image_info.latents_crop_ltrb # calc values later if flipped if not flipped: latents = image_info.latents + alpha_mask = image_info.alpha_mask else: latents = image_info.latents_flipped - + alpha_mask = image_info.alpha_mask_flipped + image = None elif image_info.latents_npz is not None: # FineTuningDatasetまたはcache_latents_to_disk=Trueの場合 - latents, original_size, crop_ltrb, flipped_latents = load_latents_from_disk(image_info.latents_npz) + latents, original_size, crop_ltrb, flipped_latents, alpha_mask, flipped_alpha_mask = load_latents_from_disk(image_info.latents_npz) if flipped: latents = flipped_latents + alpha_mask = flipped_alpha_mask del flipped_latents + del flipped_alpha_mask latents = torch.FloatTensor(latents) + if alpha_mask is not None: + alpha_mask = torch.FloatTensor(alpha_mask) image = None else: # 画像を読み込み、必要ならcropする - img, face_cx, face_cy, face_w, face_h = 
self.load_image_with_face_info(subset, image_info.absolute_path) + img, face_cx, face_cy, face_w, face_h = self.load_image_with_face_info(subset, image_info.absolute_path, subset.alpha_mask) im_h, im_w = img.shape[0:2] if self.enable_bucket: @@ -1241,11 +1153,22 @@ def __getitem__(self, index): if flipped: img = img[:, ::-1, :].copy() # copy to avoid negative stride problem + if subset.alpha_mask: + if img.shape[2] == 4: + alpha_mask = img[:, :, 3] # [W,H] + else: + alpha_mask = np.full((im_w, im_h), 255, dtype=np.uint8) # [W,H] + alpha_mask = transforms.ToTensor()(alpha_mask) + else: + alpha_mask = None + img = img[:, :, :3] # remove alpha channel + latents = None image = self.image_transforms(img) # -1.0~1.0のtorch.Tensorになる images.append(image) latents_list.append(latents) + alpha_mask_list.append(alpha_mask) target_size = (image.shape[2], image.shape[1]) if image is not None else (latents.shape[2] * 8, latents.shape[1] * 8) @@ -1348,6 +1271,8 @@ def __getitem__(self, index): example["network_multipliers"] = torch.FloatTensor([self.network_multiplier] * len(captions)) + example["alpha_mask"] = torch.stack(alpha_mask_list) if alpha_mask_list[0] is not None else None + if self.debug_dataset: example["image_keys"] = bucket[image_index : image_index + self.batch_size] return example @@ -2145,7 +2070,7 @@ def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool): # 戻り値は、latents_tensor, (original_size width, original_size height), (crop left, crop top) def load_latents_from_disk( npz_path, -) -> Tuple[Optional[torch.Tensor], Optional[List[int]], Optional[List[int]], Optional[torch.Tensor]]: +) -> Tuple[Optional[torch.Tensor], Optional[List[int]], Optional[List[int]], Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: npz = np.load(npz_path) if "latents" not in npz: raise ValueError(f"error: npz is old format. 
please re-generate {npz_path}") @@ -2154,13 +2079,19 @@ def load_latents_from_disk( original_size = npz["original_size"].tolist() crop_ltrb = npz["crop_ltrb"].tolist() flipped_latents = npz["latents_flipped"] if "latents_flipped" in npz else None - return latents, original_size, crop_ltrb, flipped_latents + alpha_mask = npz["alpha_mask"] if "alpha_mask" in npz else None + flipped_alpha_mask = npz["flipped_alpha_mask"] if "flipped_alpha_mask" in npz else None + return latents, original_size, crop_ltrb, flipped_latents, alpha_mask, flipped_alpha_mask -def save_latents_to_disk(npz_path, latents_tensor, original_size, crop_ltrb, flipped_latents_tensor=None): +def save_latents_to_disk(npz_path, latents_tensor, original_size, crop_ltrb, flipped_latents_tensor=None, alpha_mask=None, flipped_alpha_mask=None): kwargs = {} if flipped_latents_tensor is not None: kwargs["latents_flipped"] = flipped_latents_tensor.float().cpu().numpy() + if alpha_mask is not None: + kwargs["alpha_mask"] = alpha_mask.float().cpu().numpy() + if flipped_alpha_mask is not None: + kwargs["flipped_alpha_mask"] = flipped_alpha_mask.float().cpu().numpy() np.savez( npz_path, latents=latents_tensor.float().cpu().numpy(), @@ -2349,17 +2280,20 @@ def load_arbitrary_dataset(args, tokenizer) -> MinimalDataset: return train_dataset_group -def load_image(image_path): +def load_image(image_path, alpha=False): image = Image.open(image_path) if not image.mode == "RGB": - image = image.convert("RGB") + if alpha: + image = image.convert("RGBA") + else: + image = image.convert("RGB") img = np.array(image, np.uint8) return img # 画像を読み込む。戻り値はnumpy.ndarray,(original width, original height),(crop left, crop top, crop right, crop bottom) def trim_and_resize_if_required( - random_crop: bool, image: Image.Image, reso, resized_size: Tuple[int, int] + random_crop: bool, image: np.ndarray, reso, resized_size: Tuple[int, int] ) -> Tuple[np.ndarray, Tuple[int, int], Tuple[int, int, int, int]]: image_height, image_width = image.shape[0:2] original_size = (image_width, image_height) # size before resize @@ -2403,10 +2337,18 @@ def cache_batch_latents( latents_original_size and latents_crop_ltrb are also set """ images = [] + alpha_masks = [] for info in image_infos: - image = load_image(info.absolute_path) if info.image is None else np.array(info.image, np.uint8) + image = load_image(info.absolute_path, info.use_alpha_mask) if info.image is None else np.array(info.image, np.uint8) # TODO 画像のメタデータが壊れていて、メタデータから割り当てたbucketと実際の画像サイズが一致しない場合があるのでチェック追加要 image, original_size, crop_ltrb = trim_and_resize_if_required(random_crop, image, info.bucket_reso, info.resized_size) + if info.use_alpha_mask: + if image.shape[2] == 4: + alpha_mask = image[:, :, 3] # [W,H] + image = image[:, :, :3] + else: + alpha_mask = np.full_like(image[:, :, 0], 255, dtype=np.uint8) # [W,H] + alpha_masks.append(transforms.ToTensor()(alpha_mask)) image = IMAGE_TRANSFORMS(image) images.append(image) @@ -2419,25 +2361,37 @@ def cache_batch_latents( with torch.no_grad(): latents = vae.encode(img_tensors).latent_dist.sample().to("cpu") + if info.use_alpha_mask: + alpha_masks = torch.stack(alpha_masks, dim=0).to("cpu") + else: + alpha_masks = [None] * len(image_infos) + flipped_alpha_masks = [None] * len(image_infos) + if flip_aug: img_tensors = torch.flip(img_tensors, dims=[3]) with torch.no_grad(): flipped_latents = vae.encode(img_tensors).latent_dist.sample().to("cpu") + if info.use_alpha_mask: + flipped_alpha_masks = torch.flip(alpha_masks, dims=[3]) else: flipped_latents = [None] * 
len(latents) + flipped_alpha_masks = [None] * len(image_infos) - for info, latent, flipped_latent in zip(image_infos, latents, flipped_latents): + for info, latent, flipped_latent, alpha_mask, flipped_alpha_mask in zip(image_infos, latents, flipped_latents, alpha_masks, flipped_alpha_masks): # check NaN if torch.isnan(latents).any() or (flipped_latent is not None and torch.isnan(flipped_latent).any()): raise RuntimeError(f"NaN detected in latents: {info.absolute_path}") if cache_to_disk: - save_latents_to_disk(info.latents_npz, latent, info.latents_original_size, info.latents_crop_ltrb, flipped_latent) + save_latents_to_disk(info.latents_npz, latent, info.latents_original_size, info.latents_crop_ltrb, flipped_latent, alpha_mask, flipped_alpha_mask) else: info.latents = latent if flip_aug: info.latents_flipped = flipped_latent + info.alpha_mask = alpha_mask + info.alpha_mask_flipped = flipped_alpha_mask + if not HIGH_VRAM: clean_memory_on_device(vae.device) @@ -3683,6 +3637,11 @@ def add_dataset_arguments( default=0, help="tag length reaches maximum on N steps (or N*max_train_steps if N<1) / N(N<1ならN*max_train_steps)ステップでタグ長が最大になる。デフォルトは0(最初から最大)", ) + parser.add_argument( + "--alpha_mask", + action="store_true", + help="use alpha channel as mask for training / 画像のアルファチャンネルをlossのマスクに使用する", + ) parser.add_argument( "--dataset_class", diff --git a/sdxl_train.py b/sdxl_train.py index 7c71a5133..dcd06766b 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -712,7 +712,9 @@ def optimizer_hook(parameter: torch.Tensor): noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c ) if args.masked_loss: - loss = apply_masked_loss(loss, batch) + loss = apply_masked_loss(loss, batch["conditioning_images"][:, 0].unsqueeze(1)) + if "alpha_mask" in batch and batch["alpha_mask"] is not None: + loss = apply_masked_loss(loss, batch["alpha_mask"]) loss = loss.mean([1, 2, 3]) if args.min_snr_gamma: diff --git a/train_db.py b/train_db.py index a5408cd3d..c46900006 100644 --- a/train_db.py +++ b/train_db.py @@ -360,7 +360,9 @@ def train(args): loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) if args.masked_loss: - loss = apply_masked_loss(loss, batch) + loss = apply_masked_loss(loss, batch["conditioning_images"][:, 0].unsqueeze(1)) + if "alpha_mask" in batch and batch["alpha_mask"] is not None: + loss = apply_masked_loss(loss, batch["alpha_mask"]) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight diff --git a/train_network.py b/train_network.py index 38e4888e8..cd1677ad2 100644 --- a/train_network.py +++ b/train_network.py @@ -903,7 +903,9 @@ def remove_model(old_ckpt_name): noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c ) if args.masked_loss: - loss = apply_masked_loss(loss, batch) + loss = apply_masked_loss(loss, batch["conditioning_images"][:, 0].unsqueeze(1)) + if "alpha_mask" in batch and batch["alpha_mask"] is not None: + loss = apply_masked_loss(loss, batch["alpha_mask"]) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 184607d1d..a9c2a1094 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -590,7 +590,9 @@ def remove_model(old_ckpt_name): loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) if 
args.masked_loss: - loss = apply_masked_loss(loss, batch) + loss = apply_masked_loss(loss, batch["conditioning_images"][:, 0].unsqueeze(1)) + if "alpha_mask" in batch and batch["alpha_mask"] is not None: + loss = apply_masked_loss(loss, batch["alpha_mask"]) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 8eed00fa1..959839cbb 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -475,7 +475,9 @@ def remove_model(old_ckpt_name): loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) if args.masked_loss: - loss = apply_masked_loss(loss, batch) + loss = apply_masked_loss(loss, batch["conditioning_images"][:, 0].unsqueeze(1)) + if "alpha_mask" in batch and batch["alpha_mask"] is not None: + loss = apply_masked_loss(loss, batch["alpha_mask"]) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight From f2dd43e198f4bc059f4790ada041fa8f2a305f25 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 19 May 2024 19:23:59 +0900 Subject: [PATCH 124/132] revert kwargs to explicit declaration --- library/train_util.py | 158 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 142 insertions(+), 16 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 20f8055dc..6cf285903 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -409,6 +409,7 @@ def __init__( self.alpha_mask = alpha_mask + class DreamBoothSubset(BaseSubset): def __init__( self, @@ -417,13 +418,47 @@ def __init__( class_tokens: Optional[str], caption_extension: str, cache_info: bool, - **kwargs, + num_repeats, + shuffle_caption, + caption_separator: str, + keep_tokens, + keep_tokens_separator, + secondary_separator, + enable_wildcard, + color_aug, + flip_aug, + face_crop_aug_range, + random_crop, + caption_dropout_rate, + caption_dropout_every_n_epochs, + caption_tag_dropout_rate, + caption_prefix, + caption_suffix, + token_warmup_min, + token_warmup_step, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" super().__init__( image_dir, - **kwargs, + num_repeats, + shuffle_caption, + caption_separator, + keep_tokens, + keep_tokens_separator, + secondary_separator, + enable_wildcard, + color_aug, + flip_aug, + face_crop_aug_range, + random_crop, + caption_dropout_rate, + caption_dropout_every_n_epochs, + caption_tag_dropout_rate, + caption_prefix, + caption_suffix, + token_warmup_min, + token_warmup_step, ) self.is_reg = is_reg @@ -444,13 +479,47 @@ def __init__( self, image_dir, metadata_file: str, - **kwargs, + num_repeats, + shuffle_caption, + caption_separator, + keep_tokens, + keep_tokens_separator, + secondary_separator, + enable_wildcard, + color_aug, + flip_aug, + face_crop_aug_range, + random_crop, + caption_dropout_rate, + caption_dropout_every_n_epochs, + caption_tag_dropout_rate, + caption_prefix, + caption_suffix, + token_warmup_min, + token_warmup_step, ) -> None: assert metadata_file is not None, "metadata_file must be specified / metadata_fileは指定が必須です" super().__init__( image_dir, - **kwargs, + num_repeats, + shuffle_caption, + caption_separator, + keep_tokens, + keep_tokens_separator, + secondary_separator, + enable_wildcard, + color_aug, + flip_aug, + face_crop_aug_range, + random_crop, + caption_dropout_rate, + caption_dropout_every_n_epochs, + caption_tag_dropout_rate, + caption_prefix, + 
caption_suffix, + token_warmup_min, + token_warmup_step, ) self.metadata_file = metadata_file @@ -468,13 +537,47 @@ def __init__( conditioning_data_dir: str, caption_extension: str, cache_info: bool, - **kwargs, + num_repeats, + shuffle_caption, + caption_separator, + keep_tokens, + keep_tokens_separator, + secondary_separator, + enable_wildcard, + color_aug, + flip_aug, + face_crop_aug_range, + random_crop, + caption_dropout_rate, + caption_dropout_every_n_epochs, + caption_tag_dropout_rate, + caption_prefix, + caption_suffix, + token_warmup_min, + token_warmup_step, ) -> None: assert image_dir is not None, "image_dir must be specified / image_dirは指定が必須です" super().__init__( image_dir, - **kwargs, + num_repeats, + shuffle_caption, + caption_separator, + keep_tokens, + keep_tokens_separator, + secondary_separator, + enable_wildcard, + color_aug, + flip_aug, + face_crop_aug_range, + random_crop, + caption_dropout_rate, + caption_dropout_every_n_epochs, + caption_tag_dropout_rate, + caption_prefix, + caption_suffix, + token_warmup_min, + token_warmup_step, ) self.conditioning_data_dir = conditioning_data_dir @@ -1100,10 +1203,12 @@ def __getitem__(self, index): else: latents = image_info.latents_flipped alpha_mask = image_info.alpha_mask_flipped - + image = None elif image_info.latents_npz is not None: # FineTuningDatasetまたはcache_latents_to_disk=Trueの場合 - latents, original_size, crop_ltrb, flipped_latents, alpha_mask, flipped_alpha_mask = load_latents_from_disk(image_info.latents_npz) + latents, original_size, crop_ltrb, flipped_latents, alpha_mask, flipped_alpha_mask = load_latents_from_disk( + image_info.latents_npz + ) if flipped: latents = flipped_latents alpha_mask = flipped_alpha_mask @@ -1116,7 +1221,9 @@ def __getitem__(self, index): image = None else: # 画像を読み込み、必要ならcropする - img, face_cx, face_cy, face_w, face_h = self.load_image_with_face_info(subset, image_info.absolute_path, subset.alpha_mask) + img, face_cx, face_cy, face_w, face_h = self.load_image_with_face_info( + subset, image_info.absolute_path, subset.alpha_mask + ) im_h, im_w = img.shape[0:2] if self.enable_bucket: @@ -1157,7 +1264,7 @@ def __getitem__(self, index): if img.shape[2] == 4: alpha_mask = img[:, :, 3] # [W,H] else: - alpha_mask = np.full((im_w, im_h), 255, dtype=np.uint8) # [W,H] + alpha_mask = np.full((im_w, im_h), 255, dtype=np.uint8) # [W,H] alpha_mask = transforms.ToTensor()(alpha_mask) else: alpha_mask = None @@ -2070,7 +2177,14 @@ def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool): # 戻り値は、latents_tensor, (original_size width, original_size height), (crop left, crop top) def load_latents_from_disk( npz_path, -) -> Tuple[Optional[torch.Tensor], Optional[List[int]], Optional[List[int]], Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: +) -> Tuple[ + Optional[torch.Tensor], + Optional[List[int]], + Optional[List[int]], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], +]: npz = np.load(npz_path) if "latents" not in npz: raise ValueError(f"error: npz is old format. 
please re-generate {npz_path}") @@ -2084,7 +2198,9 @@ def load_latents_from_disk( return latents, original_size, crop_ltrb, flipped_latents, alpha_mask, flipped_alpha_mask -def save_latents_to_disk(npz_path, latents_tensor, original_size, crop_ltrb, flipped_latents_tensor=None, alpha_mask=None, flipped_alpha_mask=None): +def save_latents_to_disk( + npz_path, latents_tensor, original_size, crop_ltrb, flipped_latents_tensor=None, alpha_mask=None, flipped_alpha_mask=None +): kwargs = {} if flipped_latents_tensor is not None: kwargs["latents_flipped"] = flipped_latents_tensor.float().cpu().numpy() @@ -2344,10 +2460,10 @@ def cache_batch_latents( image, original_size, crop_ltrb = trim_and_resize_if_required(random_crop, image, info.bucket_reso, info.resized_size) if info.use_alpha_mask: if image.shape[2] == 4: - alpha_mask = image[:, :, 3] # [W,H] + alpha_mask = image[:, :, 3] # [W,H] image = image[:, :, :3] else: - alpha_mask = np.full_like(image[:, :, 0], 255, dtype=np.uint8) # [W,H] + alpha_mask = np.full_like(image[:, :, 0], 255, dtype=np.uint8) # [W,H] alpha_masks.append(transforms.ToTensor()(alpha_mask)) image = IMAGE_TRANSFORMS(image) images.append(image) @@ -2377,13 +2493,23 @@ def cache_batch_latents( flipped_latents = [None] * len(latents) flipped_alpha_masks = [None] * len(image_infos) - for info, latent, flipped_latent, alpha_mask, flipped_alpha_mask in zip(image_infos, latents, flipped_latents, alpha_masks, flipped_alpha_masks): + for info, latent, flipped_latent, alpha_mask, flipped_alpha_mask in zip( + image_infos, latents, flipped_latents, alpha_masks, flipped_alpha_masks + ): # check NaN if torch.isnan(latents).any() or (flipped_latent is not None and torch.isnan(flipped_latent).any()): raise RuntimeError(f"NaN detected in latents: {info.absolute_path}") if cache_to_disk: - save_latents_to_disk(info.latents_npz, latent, info.latents_original_size, info.latents_crop_ltrb, flipped_latent, alpha_mask, flipped_alpha_mask) + save_latents_to_disk( + info.latents_npz, + latent, + info.latents_original_size, + info.latents_crop_ltrb, + flipped_latent, + alpha_mask, + flipped_alpha_mask, + ) else: info.latents = latent if flip_aug: From da6fea3d9779970a1c573bf26fe37c924efc68d8 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 19 May 2024 21:26:18 +0900 Subject: [PATCH 125/132] simplify and update alpha mask to work with various cases --- finetune/prepare_buckets_latents.py | 33 +++++-- library/config_util.py | 2 + library/custom_train_functions.py | 15 ++- library/train_util.py | 147 +++++++++++++++------------- sdxl_train.py | 6 +- tools/cache_latents.py | 12 ++- train_db.py | 6 +- train_network.py | 10 +- train_textual_inversion.py | 6 +- train_textual_inversion_XTI.py | 6 +- 10 files changed, 139 insertions(+), 104 deletions(-) diff --git a/finetune/prepare_buckets_latents.py b/finetune/prepare_buckets_latents.py index 0389da388..019c737a6 100644 --- a/finetune/prepare_buckets_latents.py +++ b/finetune/prepare_buckets_latents.py @@ -11,6 +11,7 @@ import torch from library.device_utils import init_ipex, get_preferred_device + init_ipex() from torchvision import transforms @@ -18,8 +19,10 @@ import library.model_util as model_util import library.train_util as train_util from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) DEVICE = get_preferred_device() @@ -89,7 +92,9 @@ def main(args): # bucketのサイズを計算する max_reso = tuple([int(t) for t in args.max_resolution.split(",")]) - assert len(max_reso) == 2, f"illegal resolution (not 
'width,height') / 画像サイズに誤りがあります。'幅,高さ'で指定してください: {args.max_resolution}" + assert ( + len(max_reso) == 2 + ), f"illegal resolution (not 'width,height') / 画像サイズに誤りがあります。'幅,高さ'で指定してください: {args.max_resolution}" bucket_manager = train_util.BucketManager( args.bucket_no_upscale, max_reso, args.min_bucket_reso, args.max_bucket_reso, args.bucket_reso_steps @@ -107,7 +112,7 @@ def main(args): def process_batch(is_last): for bucket in bucket_manager.buckets: if (is_last and len(bucket) > 0) or len(bucket) >= args.batch_size: - train_util.cache_batch_latents(vae, True, bucket, args.flip_aug, False) + train_util.cache_batch_latents(vae, True, bucket, args.flip_aug, args.alpha_mask, False) bucket.clear() # 読み込みの高速化のためにDataLoaderを使うオプション @@ -208,7 +213,9 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument("in_json", type=str, help="metadata file to input / 読み込むメタデータファイル") parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先") parser.add_argument("model_name_or_path", type=str, help="model name or path to encode latents / latentを取得するためのモデル") - parser.add_argument("--v2", action="store_true", help="not used (for backward compatibility) / 使用されません(互換性のため残してあります)") + parser.add_argument( + "--v2", action="store_true", help="not used (for backward compatibility) / 使用されません(互換性のため残してあります)" + ) parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ") parser.add_argument( "--max_data_loader_n_workers", @@ -231,10 +238,16 @@ def setup_parser() -> argparse.ArgumentParser: help="steps of resolution for buckets, divisible by 8 is recommended / bucketの解像度の単位、8で割り切れる値を推奨します", ) parser.add_argument( - "--bucket_no_upscale", action="store_true", help="make bucket for each image without upscaling / 画像を拡大せずbucketを作成します" + "--bucket_no_upscale", + action="store_true", + help="make bucket for each image without upscaling / 画像を拡大せずbucketを作成します", ) parser.add_argument( - "--mixed_precision", type=str, default="no", choices=["no", "fp16", "bf16"], help="use mixed precision / 混合精度を使う場合、その精度" + "--mixed_precision", + type=str, + default="no", + choices=["no", "fp16", "bf16"], + help="use mixed precision / 混合精度を使う場合、その精度", ) parser.add_argument( "--full_path", @@ -242,7 +255,15 @@ def setup_parser() -> argparse.ArgumentParser: help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)", ) parser.add_argument( - "--flip_aug", action="store_true", help="flip augmentation, save latents for flipped images / 左右反転した画像もlatentを取得、保存する" + "--flip_aug", + action="store_true", + help="flip augmentation, save latents for flipped images / 左右反転した画像もlatentを取得、保存する", + ) + parser.add_argument( + "--alpha_mask", + type=str, + default="", + help="save alpha mask for images for loss calculation / 損失計算用に画像のアルファマスクを保存する", ) parser.add_argument( "--skip_existing", diff --git a/library/config_util.py b/library/config_util.py index 82baab83e..964270dbb 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -214,11 +214,13 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence] DB_SUBSET_DISTINCT_SCHEMA = { Required("image_dir"): str, "is_reg": bool, + "alpha_mask": bool, } # FT means FineTuning FT_SUBSET_DISTINCT_SCHEMA = { Required("metadata_file"): str, "image_dir": str, + "alpha_mask": bool, } CN_SUBSET_ASCENDABLE_SCHEMA = { "caption_extension": str, diff --git a/library/custom_train_functions.py b/library/custom_train_functions.py index fad127405..af5813a1d 
100644 --- a/library/custom_train_functions.py +++ b/library/custom_train_functions.py @@ -479,14 +479,19 @@ def apply_noise_offset(latents, noise, noise_offset, adaptive_noise_scale): return noise -def apply_masked_loss(loss, mask_image): - # mask image is -1 to 1. we need to convert it to 0 to 1 - # mask_image = batch["conditioning_images"].to(dtype=loss.dtype)[:, 0].unsqueeze(1) # use R channel - mask_image = mask_image.to(dtype=loss.dtype) +def apply_masked_loss(loss, batch): + if "conditioning_images" in batch: + # conditioning image is -1 to 1. we need to convert it to 0 to 1 + mask_image = batch["conditioning_images"].to(dtype=loss.dtype)[:, 0].unsqueeze(1) # use R channel + mask_image = mask_image / 2 + 0.5 + elif "alpha_masks" in batch and batch["alpha_masks"] is not None: + # alpha mask is 0 to 1 + mask_image = batch["alpha_masks"].to(dtype=loss.dtype) + else: + return loss # resize to the same size as the loss mask_image = torch.nn.functional.interpolate(mask_image, size=loss.shape[2:], mode="area") - mask_image = mask_image / 2 + 0.5 loss = loss * mask_image return loss diff --git a/library/train_util.py b/library/train_util.py index 6cf285903..e7a50f04d 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -159,9 +159,7 @@ def __init__(self, image_key: str, num_repeats: int, caption: str, is_reg: bool, self.text_encoder_outputs1: Optional[torch.Tensor] = None self.text_encoder_outputs2: Optional[torch.Tensor] = None self.text_encoder_pool2: Optional[torch.Tensor] = None - self.alpha_mask: Optional[torch.Tensor] = None - self.alpha_mask_flipped: Optional[torch.Tensor] = None - self.use_alpha_mask: bool = False + self.alpha_mask: Optional[torch.Tensor] = None # alpha mask can be flipped in runtime class BucketManager: @@ -364,6 +362,7 @@ class BaseSubset: def __init__( self, image_dir: Optional[str], + alpha_mask: Optional[bool], num_repeats: int, shuffle_caption: bool, caption_separator: str, @@ -382,9 +381,9 @@ def __init__( caption_suffix: Optional[str], token_warmup_min: int, token_warmup_step: Union[float, int], - alpha_mask: bool, ) -> None: self.image_dir = image_dir + self.alpha_mask = alpha_mask if alpha_mask is not None else False self.num_repeats = num_repeats self.shuffle_caption = shuffle_caption self.caption_separator = caption_separator @@ -407,8 +406,6 @@ def __init__( self.img_count = 0 - self.alpha_mask = alpha_mask - class DreamBoothSubset(BaseSubset): def __init__( @@ -418,6 +415,7 @@ def __init__( class_tokens: Optional[str], caption_extension: str, cache_info: bool, + alpha_mask: bool, num_repeats, shuffle_caption, caption_separator: str, @@ -441,6 +439,7 @@ def __init__( super().__init__( image_dir, + alpha_mask, num_repeats, shuffle_caption, caption_separator, @@ -479,6 +478,7 @@ def __init__( self, image_dir, metadata_file: str, + alpha_mask: bool, num_repeats, shuffle_caption, caption_separator, @@ -502,6 +502,7 @@ def __init__( super().__init__( image_dir, + alpha_mask, num_repeats, shuffle_caption, caption_separator, @@ -921,7 +922,7 @@ def make_buckets(self): logger.info(f"mean ar error (without repeats): {mean_img_ar_error}") # データ参照用indexを作る。このindexはdatasetのshuffleに用いられる - self.buckets_indices: List(BucketBatchIndex) = [] + self.buckets_indices: List[BucketBatchIndex] = [] for bucket_index, bucket in enumerate(self.bucket_manager.buckets): batch_count = int(math.ceil(len(bucket) / self.batch_size)) for batch_index in range(batch_count): @@ -991,8 +992,6 @@ def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_proc 
for info in tqdm(image_infos): subset = self.image_to_subset[info.image_key] - info.use_alpha_mask = subset.alpha_mask - if info.latents_npz is not None: # fine tuning dataset continue @@ -1002,7 +1001,9 @@ def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_proc if not is_main_process: # store to info only continue - cache_available = is_disk_cached_latents_is_expected(info.bucket_reso, info.latents_npz, subset.flip_aug) + cache_available = is_disk_cached_latents_is_expected( + info.bucket_reso, info.latents_npz, subset.flip_aug, subset.alpha_mask + ) if cache_available: # do not add to batch continue @@ -1028,7 +1029,7 @@ def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=False, is_main_proc # iterate batches: batch doesn't have image, image will be loaded in cache_batch_latents and discarded logger.info("caching latents...") for batch in tqdm(batches, smoothing=1, total=len(batches)): - cache_batch_latents(vae, cache_to_disk, batch, subset.flip_aug, subset.random_crop) + cache_batch_latents(vae, cache_to_disk, batch, subset.flip_aug, subset.alpha_mask, subset.random_crop) # weight_dtypeを指定するとText Encoderそのもの、およひ出力がweight_dtypeになる # SDXLでのみ有効だが、datasetのメソッドとする必要があるので、sdxl_train_util.pyではなくこちらに実装する @@ -1202,18 +1203,15 @@ def __getitem__(self, index): alpha_mask = image_info.alpha_mask else: latents = image_info.latents_flipped - alpha_mask = image_info.alpha_mask_flipped + alpha_mask = None if image_info.alpha_mask is None else torch.flip(image_info.alpha_mask, [1]) image = None elif image_info.latents_npz is not None: # FineTuningDatasetまたはcache_latents_to_disk=Trueの場合 - latents, original_size, crop_ltrb, flipped_latents, alpha_mask, flipped_alpha_mask = load_latents_from_disk( - image_info.latents_npz - ) + latents, original_size, crop_ltrb, flipped_latents, alpha_mask = load_latents_from_disk(image_info.latents_npz) if flipped: latents = flipped_latents - alpha_mask = flipped_alpha_mask + alpha_mask = None if alpha_mask is None else alpha_mask[:, ::-1].copy() # copy to avoid negative stride problem del flipped_latents - del flipped_alpha_mask latents = torch.FloatTensor(latents) if alpha_mask is not None: alpha_mask = torch.FloatTensor(alpha_mask) @@ -1255,23 +1253,28 @@ def __getitem__(self, index): # augmentation aug = self.aug_helper.get_augmentor(subset.color_aug) if aug is not None: - img = aug(image=img)["image"] + # augment RGB channels only + img_rgb = img[:, :, :3] + img_rgb = aug(image=img_rgb)["image"] + img[:, :, :3] = img_rgb if flipped: img = img[:, ::-1, :].copy() # copy to avoid negative stride problem if subset.alpha_mask: if img.shape[2] == 4: - alpha_mask = img[:, :, 3] # [W,H] + alpha_mask = img[:, :, 3] # [H,W] + alpha_mask = transforms.ToTensor()(alpha_mask) # 0-255 -> 0-1 else: - alpha_mask = np.full((im_w, im_h), 255, dtype=np.uint8) # [W,H] - alpha_mask = transforms.ToTensor()(alpha_mask) + alpha_mask = torch.ones((img.shape[0], img.shape[1]), dtype=torch.float32) else: alpha_mask = None + img = img[:, :, :3] # remove alpha channel latents = None image = self.image_transforms(img) # -1.0~1.0のtorch.Tensorになる + del img images.append(image) latents_list.append(latents) @@ -1361,6 +1364,23 @@ def __getitem__(self, index): example["text_encoder_outputs2_list"] = torch.stack(text_encoder_outputs2_list) example["text_encoder_pool2_list"] = torch.stack(text_encoder_pool2_list) + # if one of alpha_masks is not None, we need to replace None with ones + none_or_not = [x is None for x in alpha_mask_list] + if all(none_or_not): + 
example["alpha_masks"] = None + elif any(none_or_not): + for i in range(len(alpha_mask_list)): + if alpha_mask_list[i] is None: + if images[i] is not None: + alpha_mask_list[i] = torch.ones((images[i].shape[1], images[i].shape[2]), dtype=torch.float32) + else: + alpha_mask_list[i] = torch.ones( + (latents_list[i].shape[1] * 8, latents_list[i].shape[2] * 8), dtype=torch.float32 + ) + example["alpha_masks"] = torch.stack(alpha_mask_list) + else: + example["alpha_masks"] = torch.stack(alpha_mask_list) + if images[0] is not None: images = torch.stack(images) images = images.to(memory_format=torch.contiguous_format).float() @@ -1378,8 +1398,6 @@ def __getitem__(self, index): example["network_multipliers"] = torch.FloatTensor([self.network_multiplier] * len(captions)) - example["alpha_mask"] = torch.stack(alpha_mask_list) if alpha_mask_list[0] is not None else None - if self.debug_dataset: example["image_keys"] = bucket[image_index : image_index + self.batch_size] return example @@ -1393,6 +1411,7 @@ def get_item_for_caching(self, bucket, bucket_batch_size, image_index): resized_sizes = [] bucket_reso = None flip_aug = None + alpha_mask = None random_crop = None for image_key in bucket[image_index : image_index + bucket_batch_size]: @@ -1401,10 +1420,13 @@ def get_item_for_caching(self, bucket, bucket_batch_size, image_index): if flip_aug is None: flip_aug = subset.flip_aug + alpha_mask = subset.alpha_mask random_crop = subset.random_crop bucket_reso = image_info.bucket_reso else: + # TODO そもそも混在してても動くようにしたほうがいい assert flip_aug == subset.flip_aug, "flip_aug must be same in a batch" + assert alpha_mask == subset.alpha_mask, "alpha_mask must be same in a batch" assert random_crop == subset.random_crop, "random_crop must be same in a batch" assert bucket_reso == image_info.bucket_reso, "bucket_reso must be same in a batch" @@ -1441,6 +1463,7 @@ def get_item_for_caching(self, bucket, bucket_batch_size, image_index): example["absolute_paths"] = absolute_paths example["resized_sizes"] = resized_sizes example["flip_aug"] = flip_aug + example["alpha_mask"] = alpha_mask example["random_crop"] = random_crop example["bucket_reso"] = bucket_reso return example @@ -2149,7 +2172,7 @@ def disable_token_padding(self): dataset.disable_token_padding() -def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool): +def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool, alpha_mask: bool): expected_latents_size = (reso[1] // 8, reso[0] // 8) # bucket_resoはWxHなので注意 if not os.path.exists(npz_path): @@ -2167,6 +2190,12 @@ def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool): return False if npz["latents_flipped"].shape[1:3] != expected_latents_size: return False + + if alpha_mask: + if "alpha_mask" not in npz: + return False + if npz["alpha_mask"].shape[0:2] != reso: # HxW + return False except Exception as e: logger.error(f"Error loading file: {npz_path}") raise e @@ -2177,14 +2206,7 @@ def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool): # 戻り値は、latents_tensor, (original_size width, original_size height), (crop left, crop top) def load_latents_from_disk( npz_path, -) -> Tuple[ - Optional[torch.Tensor], - Optional[List[int]], - Optional[List[int]], - Optional[torch.Tensor], - Optional[torch.Tensor], - Optional[torch.Tensor], -]: +) -> Tuple[Optional[torch.Tensor], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: npz = np.load(npz_path) if "latents" not in npz: raise ValueError(f"error: npz is old 
format. please re-generate {npz_path}") @@ -2194,20 +2216,15 @@ def load_latents_from_disk( crop_ltrb = npz["crop_ltrb"].tolist() flipped_latents = npz["latents_flipped"] if "latents_flipped" in npz else None alpha_mask = npz["alpha_mask"] if "alpha_mask" in npz else None - flipped_alpha_mask = npz["flipped_alpha_mask"] if "flipped_alpha_mask" in npz else None - return latents, original_size, crop_ltrb, flipped_latents, alpha_mask, flipped_alpha_mask + return latents, original_size, crop_ltrb, flipped_latents, alpha_mask -def save_latents_to_disk( - npz_path, latents_tensor, original_size, crop_ltrb, flipped_latents_tensor=None, alpha_mask=None, flipped_alpha_mask=None -): +def save_latents_to_disk(npz_path, latents_tensor, original_size, crop_ltrb, flipped_latents_tensor=None, alpha_mask=None): kwargs = {} if flipped_latents_tensor is not None: kwargs["latents_flipped"] = flipped_latents_tensor.float().cpu().numpy() if alpha_mask is not None: - kwargs["alpha_mask"] = alpha_mask.float().cpu().numpy() - if flipped_alpha_mask is not None: - kwargs["flipped_alpha_mask"] = flipped_alpha_mask.float().cpu().numpy() + kwargs["alpha_mask"] = alpha_mask # ndarray np.savez( npz_path, latents=latents_tensor.float().cpu().numpy(), @@ -2398,10 +2415,11 @@ def load_arbitrary_dataset(args, tokenizer) -> MinimalDataset: def load_image(image_path, alpha=False): image = Image.open(image_path) - if not image.mode == "RGB": - if alpha: + if alpha: + if not image.mode == "RGBA": image = image.convert("RGBA") - else: + else: + if not image.mode == "RGB": image = image.convert("RGB") img = np.array(image, np.uint8) return img @@ -2441,7 +2459,7 @@ def trim_and_resize_if_required( def cache_batch_latents( - vae: AutoencoderKL, cache_to_disk: bool, image_infos: List[ImageInfo], flip_aug: bool, random_crop: bool + vae: AutoencoderKL, cache_to_disk: bool, image_infos: List[ImageInfo], flip_aug: bool, use_alpha_mask: bool, random_crop: bool ) -> None: r""" requires image_infos to have: absolute_path, bucket_reso, resized_size, latents_npz @@ -2453,49 +2471,43 @@ def cache_batch_latents( latents_original_size and latents_crop_ltrb are also set """ images = [] - alpha_masks = [] + alpha_masks: List[np.ndarray] = [] for info in image_infos: - image = load_image(info.absolute_path, info.use_alpha_mask) if info.image is None else np.array(info.image, np.uint8) + image = load_image(info.absolute_path, use_alpha_mask) if info.image is None else np.array(info.image, np.uint8) # TODO 画像のメタデータが壊れていて、メタデータから割り当てたbucketと実際の画像サイズが一致しない場合があるのでチェック追加要 image, original_size, crop_ltrb = trim_and_resize_if_required(random_crop, image, info.bucket_reso, info.resized_size) - if info.use_alpha_mask: + + info.latents_original_size = original_size + info.latents_crop_ltrb = crop_ltrb + + if use_alpha_mask: if image.shape[2] == 4: - alpha_mask = image[:, :, 3] # [W,H] - image = image[:, :, :3] + alpha_mask = image[:, :, 3] # [H,W] + alpha_mask = alpha_mask.astype(np.float32) / 255.0 else: - alpha_mask = np.full_like(image[:, :, 0], 255, dtype=np.uint8) # [W,H] - alpha_masks.append(transforms.ToTensor()(alpha_mask)) + alpha_mask = np.ones_like(image[:, :, 0], dtype=np.float32) + else: + alpha_mask = None + alpha_masks.append(alpha_mask) + + image = image[:, :, :3] # remove alpha channel if exists image = IMAGE_TRANSFORMS(image) images.append(image) - info.latents_original_size = original_size - info.latents_crop_ltrb = crop_ltrb - img_tensors = torch.stack(images, dim=0) img_tensors = img_tensors.to(device=vae.device, dtype=vae.dtype) with 
torch.no_grad(): latents = vae.encode(img_tensors).latent_dist.sample().to("cpu") - if info.use_alpha_mask: - alpha_masks = torch.stack(alpha_masks, dim=0).to("cpu") - else: - alpha_masks = [None] * len(image_infos) - flipped_alpha_masks = [None] * len(image_infos) - if flip_aug: img_tensors = torch.flip(img_tensors, dims=[3]) with torch.no_grad(): flipped_latents = vae.encode(img_tensors).latent_dist.sample().to("cpu") - if info.use_alpha_mask: - flipped_alpha_masks = torch.flip(alpha_masks, dims=[3]) else: flipped_latents = [None] * len(latents) - flipped_alpha_masks = [None] * len(image_infos) - for info, latent, flipped_latent, alpha_mask, flipped_alpha_mask in zip( - image_infos, latents, flipped_latents, alpha_masks, flipped_alpha_masks - ): + for info, latent, flipped_latent, alpha_mask in zip(image_infos, latents, flipped_latents, alpha_masks): # check NaN if torch.isnan(latents).any() or (flipped_latent is not None and torch.isnan(flipped_latent).any()): raise RuntimeError(f"NaN detected in latents: {info.absolute_path}") @@ -2508,15 +2520,12 @@ def cache_batch_latents( info.latents_crop_ltrb, flipped_latent, alpha_mask, - flipped_alpha_mask, ) else: info.latents = latent if flip_aug: info.latents_flipped = flipped_latent - info.alpha_mask = alpha_mask - info.alpha_mask_flipped = flipped_alpha_mask if not HIGH_VRAM: clean_memory_on_device(vae.device) diff --git a/sdxl_train.py b/sdxl_train.py index dcd06766b..9e20c60ca 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -711,10 +711,8 @@ def optimizer_hook(parameter: torch.Tensor): loss = train_util.conditional_loss( noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c ) - if args.masked_loss: - loss = apply_masked_loss(loss, batch["conditioning_images"][:, 0].unsqueeze(1)) - if "alpha_mask" in batch and batch["alpha_mask"] is not None: - loss = apply_masked_loss(loss, batch["alpha_mask"]) + if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): + loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) if args.min_snr_gamma: diff --git a/tools/cache_latents.py b/tools/cache_latents.py index 347db27f7..b7c88121e 100644 --- a/tools/cache_latents.py +++ b/tools/cache_latents.py @@ -17,10 +17,13 @@ BlueprintGenerator, ) from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) + def cache_to_disk(args: argparse.Namespace) -> None: train_util.prepare_dataset_args(args, True) @@ -107,7 +110,7 @@ def cache_to_disk(args: argparse.Namespace) -> None: else: _, vae, _, _ = train_util.load_target_model(args, weight_dtype, accelerator) - if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える + if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える vae.set_use_memory_efficient_attention_xformers(args.xformers) vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) @@ -136,6 +139,7 @@ def cache_to_disk(args: argparse.Namespace) -> None: b_size = len(batch["images"]) vae_batch_size = b_size if args.vae_batch_size is None else args.vae_batch_size flip_aug = batch["flip_aug"] + alpha_mask = batch["alpha_mask"] random_crop = batch["random_crop"] bucket_reso = batch["bucket_reso"] @@ -154,14 +158,16 @@ def cache_to_disk(args: argparse.Namespace) -> None: image_info.latents_npz = os.path.splitext(absolute_path)[0] + ".npz" if args.skip_existing: - if train_util.is_disk_cached_latents_is_expected(image_info.bucket_reso, image_info.latents_npz, flip_aug): + if 
train_util.is_disk_cached_latents_is_expected( + image_info.bucket_reso, image_info.latents_npz, flip_aug, alpha_mask + ): logger.warning(f"Skipping {image_info.latents_npz} because it already exists.") continue image_infos.append(image_info) if len(image_infos) > 0: - train_util.cache_batch_latents(vae, True, image_infos, flip_aug, random_crop) + train_util.cache_batch_latents(vae, True, image_infos, flip_aug, alpha_mask, random_crop) accelerator.wait_for_everyone() accelerator.print(f"Finished caching latents for {len(train_dataset_group)} batches.") diff --git a/train_db.py b/train_db.py index c46900006..39d8ea6ed 100644 --- a/train_db.py +++ b/train_db.py @@ -359,10 +359,8 @@ def train(args): target = noise loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) - if args.masked_loss: - loss = apply_masked_loss(loss, batch["conditioning_images"][:, 0].unsqueeze(1)) - if "alpha_mask" in batch and batch["alpha_mask"] is not None: - loss = apply_masked_loss(loss, batch["alpha_mask"]) + if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): + loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight diff --git a/train_network.py b/train_network.py index cd1677ad2..b272a6e1a 100644 --- a/train_network.py +++ b/train_network.py @@ -774,7 +774,9 @@ def load_model_hook(models, input_dir): if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) accelerator.init_trackers( - "network_train" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs + "network_train" if args.log_tracker_name is None else args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, ) loss_recorder = train_util.LossRecorder() @@ -902,10 +904,8 @@ def remove_model(old_ckpt_name): loss = train_util.conditional_loss( noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c ) - if args.masked_loss: - loss = apply_masked_loss(loss, batch["conditioning_images"][:, 0].unsqueeze(1)) - if "alpha_mask" in batch and batch["alpha_mask"] is not None: - loss = apply_masked_loss(loss, batch["alpha_mask"]) + if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): + loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight diff --git a/train_textual_inversion.py b/train_textual_inversion.py index a9c2a1094..ade077c36 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -589,10 +589,8 @@ def remove_model(old_ckpt_name): target = noise loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) - if args.masked_loss: - loss = apply_masked_loss(loss, batch["conditioning_images"][:, 0].unsqueeze(1)) - if "alpha_mask" in batch and batch["alpha_mask"] is not None: - loss = apply_masked_loss(loss, batch["alpha_mask"]) + if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): + loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 959839cbb..efb59137b 100644 --- a/train_textual_inversion_XTI.py +++ 
b/train_textual_inversion_XTI.py @@ -474,10 +474,8 @@ def remove_model(old_ckpt_name): target = noise loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c) - if args.masked_loss: - loss = apply_masked_loss(loss, batch["conditioning_images"][:, 0].unsqueeze(1)) - if "alpha_mask" in batch and batch["alpha_mask"] is not None: - loss = apply_masked_loss(loss, batch["alpha_mask"]) + if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None): + loss = apply_masked_loss(loss, batch) loss = loss.mean([1, 2, 3]) loss_weights = batch["loss_weights"] # 各sampleごとのweight From 00513b9b7066fc1307fbe26ad13ed39f3bceceb0 Mon Sep 17 00:00:00 2001 From: rockerBOO Date: Thu, 23 May 2024 22:27:12 -0400 Subject: [PATCH 126/132] Add LoRA+ LR Ratio info message to logger --- networks/dylora.py | 3 +++ networks/lora.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/networks/dylora.py b/networks/dylora.py index d57e3d580..b0925453c 100644 --- a/networks/dylora.py +++ b/networks/dylora.py @@ -368,6 +368,9 @@ def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, lorap self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio + logger.info(f"LoRA+ UNet LR Ratio: {self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio}") + logger.info(f"LoRA+ Text Encoder LR Ratio: {self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio}") + def set_multiplier(self, multiplier): self.multiplier = multiplier for lora in self.text_encoder_loras + self.unet_loras: diff --git a/networks/lora.py b/networks/lora.py index 9f159f5db..82b8b5b47 100644 --- a/networks/lora.py +++ b/networks/lora.py @@ -1134,6 +1134,9 @@ def set_loraplus_lr_ratio(self, loraplus_lr_ratio, loraplus_unet_lr_ratio, lorap self.loraplus_unet_lr_ratio = loraplus_unet_lr_ratio self.loraplus_text_encoder_lr_ratio = loraplus_text_encoder_lr_ratio + logger.info(f"LoRA+ UNet LR Ratio: {self.loraplus_unet_lr_ratio or self.loraplus_lr_ratio}") + logger.info(f"LoRA+ Text Encoder LR Ratio: {self.loraplus_text_encoder_lr_ratio or self.loraplus_lr_ratio}") + # 二つのText Encoderに別々の学習率を設定できるようにするといいかも def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr): # TODO warn if optimizer is not compatible with LoRA+ (but it will cause error so we don't need to check it here?) 
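For readers tracing the LoRA+ change in the patch above: the two added `logger.info` calls pick which ratio to report with Python's `or`, so a component-specific ratio takes precedence and the generic `loraplus_lr_ratio` is the fallback. A minimal standalone sketch of that resolution order (illustrative names only, not the library's API):

```python
# Sketch of the fallback used in the log messages above: a component-specific
# LoRA+ ratio wins; otherwise the generic ratio is reported. Note that `or`
# treats a ratio of 0 as unset, so 0 also falls back to the generic value.
def effective_ratio(component_ratio, generic_ratio):
    return component_ratio or generic_ratio

print(effective_ratio(None, 16))    # 16   -> only the generic ratio was set
print(effective_ratio(4, 16))       # 4    -> the component-specific ratio wins
print(effective_ratio(None, None))  # None -> LoRA+ ratios not configured
```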
From e8cfd4ba1d4734c4dd37c9b5fdc0633378879d9b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sun, 26 May 2024 22:01:37 +0900 Subject: [PATCH 127/132] fix to work cond mask and alpha mask --- library/config_util.py | 3 ++- library/custom_train_functions.py | 4 +++- library/train_util.py | 12 ++++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/library/config_util.py b/library/config_util.py index 964270dbb..10b2457f3 100644 --- a/library/config_util.py +++ b/library/config_util.py @@ -78,7 +78,6 @@ class BaseSubsetParams: caption_tag_dropout_rate: float = 0.0 token_warmup_min: int = 1 token_warmup_step: float = 0 - alpha_mask: bool = False @dataclass @@ -87,11 +86,13 @@ class DreamBoothSubsetParams(BaseSubsetParams): class_tokens: Optional[str] = None caption_extension: str = ".caption" cache_info: bool = False + alpha_mask: bool = False @dataclass class FineTuningSubsetParams(BaseSubsetParams): metadata_file: Optional[str] = None + alpha_mask: bool = False @dataclass diff --git a/library/custom_train_functions.py b/library/custom_train_functions.py index af5813a1d..2a513dc5b 100644 --- a/library/custom_train_functions.py +++ b/library/custom_train_functions.py @@ -484,9 +484,11 @@ def apply_masked_loss(loss, batch): # conditioning image is -1 to 1. we need to convert it to 0 to 1 mask_image = batch["conditioning_images"].to(dtype=loss.dtype)[:, 0].unsqueeze(1) # use R channel mask_image = mask_image / 2 + 0.5 + # print(f"conditioning_image: {mask_image.shape}") elif "alpha_masks" in batch and batch["alpha_masks"] is not None: # alpha mask is 0 to 1 - mask_image = batch["alpha_masks"].to(dtype=loss.dtype) + mask_image = batch["alpha_masks"].to(dtype=loss.dtype).unsqueeze(1) # add channel dimension + # print(f"mask_image: {mask_image.shape}, {mask_image.mean()}") else: return loss diff --git a/library/train_util.py b/library/train_util.py index e7a50f04d..1f9f3c5df 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -561,6 +561,7 @@ def __init__( super().__init__( image_dir, + False, # alpha_mask num_repeats, shuffle_caption, caption_separator, @@ -1947,6 +1948,7 @@ def __init__( None, subset.caption_extension, subset.cache_info, + False, subset.num_repeats, subset.shuffle_caption, subset.caption_separator, @@ -2196,6 +2198,9 @@ def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool, alph return False if npz["alpha_mask"].shape[0:2] != reso: # HxW return False + else: + if "alpha_mask" in npz: + return False except Exception as e: logger.error(f"Error loading file: {npz_path}") raise e @@ -2296,6 +2301,13 @@ def debug_dataset(train_dataset, show_input_ids=False): if os.name == "nt": cv2.imshow("cond_img", cond_img) + if "alpha_masks" in example and example["alpha_masks"] is not None: + alpha_mask = example["alpha_masks"][j] + logger.info(f"alpha mask size: {alpha_mask.size()}") + alpha_mask = (alpha_mask[0].numpy() * 255.0).astype(np.uint8) + if os.name == "nt": + cv2.imshow("alpha_mask", alpha_mask) + if os.name == "nt": # only windows cv2.imshow("img", im) k = cv2.waitKey() From d50c1b3c5cfd590e43e832272a77bf8c84d371dd Mon Sep 17 00:00:00 2001 From: Dave Lage Date: Mon, 27 May 2024 01:11:01 -0400 Subject: [PATCH 128/132] Update issue link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 838e4022c..23e049354 100644 --- a/README.md +++ b/README.md @@ -237,7 +237,7 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) Thanks to frodo821! 
https://github.com/kohya-ss/sd-scripts/pull/1291) issue [#1290]( https://github.com/kohya-ss/sd-scripts/pull/1290) frodo821 氏に感謝します。 -- データセット設定の .toml ファイルで、`caption_separator` が subset に指定できない不具合が修正されました。 PR [#1312](https://github.com/kohya-ss/sd-scripts/pull/1312) および [#1313](https://github.com/kohya-ss/sd-scripts/pull/1312) rockerBOO 氏に感謝します。 +- データセット設定の .toml ファイルで、`caption_separator` が subset に指定できない不具合が修正されました。 PR [#1312](https://github.com/kohya-ss/sd-scripts/pull/1312) および [#1313](https://github.com/kohya-ss/sd-scripts/pull/1313) rockerBOO 氏に感謝します。 - ControlNet-LLLite 学習時の潜在バグが修正されました。 PR [#1322](https://github.com/kohya-ss/sd-scripts/pull/1322) aria1th 氏に感謝します。 From a4c3155148e667f5235c2e3df52bad7fd8f95dc4 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 27 May 2024 20:59:40 +0900 Subject: [PATCH 129/132] add doc for mask loss --- docs/masked_loss_README-ja.md | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 docs/masked_loss_README-ja.md diff --git a/docs/masked_loss_README-ja.md b/docs/masked_loss_README-ja.md new file mode 100644 index 000000000..860532247 --- /dev/null +++ b/docs/masked_loss_README-ja.md @@ -0,0 +1,40 @@ +## マスクロスについて + +マスクロスは、入力画像のマスクで指定された部分だけ損失計算することで、画像の一部分だけを学習することができる機能です。 +たとえばキャラクタを学習したい場合、キャラクタ部分だけをマスクして学習することで、背景を無視して学習することができます。 + +マスクロスのマスクには、二種類の指定方法があります。 + +- マスク画像を用いる方法 +- 透明度(アルファチャネル)を使用する方法 + +なお、サンプルは [ずんずんPJイラスト/3Dデータ](https://zunko.jp/con_illust.html) の「AI画像モデル用学習データ」を使用しています。 + +### マスク画像を用いる方法 + +学習画像それぞれに対応するマスク画像を用意する方法です。学習画像と同じファイル名のマスク画像を用意し、それを学習画像と別のディレクトリに保存します。 + +マスク画像は、学習画像と同じサイズで、学習する部分を白、無視する部分を黒で描画します。グレースケールにも対応しています(127 ならロス重みが 0.5 になります)。なお、正確にはマスク画像の R チャネルが用いられます。 + +DreamBooth 方式の dataset で、`conditioning_data_dir` で指定したディレクトリにマスク画像を保存するしてください。ControlNet のデータセットと同じですので、詳細は [ControlNet-LLLite](train_lllite_README-ja.md#データセットの準備) を参照してください。 + +### 透明度(アルファチャネル)を使用する方法 + +学習画像の透明度(アルファチャネル)がマスクとして使用されます。透明度が 0 の部分は無視され、255 の部分は学習されます。半透明の場合は、その透明度に応じてロス重みが変化します(127 ならおおむね 0.5)。 + +学習時のスクリプトのオプション `--alpha_mask`、または dataset の設定ファイルの subset で、`alpha_mask` を指定してください。たとえば、以下のようになります。 + +```toml +[[datasets.subsets]] +image_dir = "/path/to/image/dir" +caption_extension = ".txt" +num_repeats = 8 +alpha_mask = true +``` + +## 学習時の注意事項 + +- 現時点では DreamBooth 方式の dataset のみ対応しています。 +- マスクは latents のサイズ、つまり 1/8 に縮小されてから適用されます。そのため、細かい部分(たとえばアホ毛やイヤリングなど)はうまく学習できない可能性があります。マスクをわずかに拡張するなどの工夫が必要かもしれません。 +- マスクロスを用いる場合、学習対象外の部分をキャプションに含める必要はないかもしれません。(要検証) +- `alpha_mask` の場合、マスクの有無を切り替えると latents キャッシュが自動的に再生成されます。 From 71ad3c0f45ba64bd5dc069addc8ef0fa94bf4e19 Mon Sep 17 00:00:00 2001 From: Kohya S <52813779+kohya-ss@users.noreply.github.com> Date: Mon, 27 May 2024 21:07:57 +0900 Subject: [PATCH 130/132] Update masked_loss_README-ja.md add sample images --- docs/masked_loss_README-ja.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/masked_loss_README-ja.md b/docs/masked_loss_README-ja.md index 860532247..5377a5aff 100644 --- a/docs/masked_loss_README-ja.md +++ b/docs/masked_loss_README-ja.md @@ -14,6 +14,11 @@ 学習画像それぞれに対応するマスク画像を用意する方法です。学習画像と同じファイル名のマスク画像を用意し、それを学習画像と別のディレクトリに保存します。 +- 学習画像 + ![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/607c5116-5f62-47de-8b66-9c4a597f0441) +- マスク画像 + ![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/53e9b0f8-a4bf-49ed-882d-4026f84e8450) + マスク画像は、学習画像と同じサイズで、学習する部分を白、無視する部分を黒で描画します。グレースケールにも対応しています(127 ならロス重みが 0.5 になります)。なお、正確にはマスク画像の R チャネルが用いられます。 DreamBooth 方式の dataset 
で、`conditioning_data_dir` で指定したディレクトリにマスク画像を保存するしてください。ControlNet のデータセットと同じですので、詳細は [ControlNet-LLLite](train_lllite_README-ja.md#データセットの準備) を参照してください。 @@ -22,7 +27,11 @@ DreamBooth 方式の dataset で、`conditioning_data_dir` で指定したディ 学習画像の透明度(アルファチャネル)がマスクとして使用されます。透明度が 0 の部分は無視され、255 の部分は学習されます。半透明の場合は、その透明度に応じてロス重みが変化します(127 ならおおむね 0.5)。 -学習時のスクリプトのオプション `--alpha_mask`、または dataset の設定ファイルの subset で、`alpha_mask` を指定してください。たとえば、以下のようになります。 +![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/0baa129b-446a-4aac-b98c-7208efb0e75e) + +※それぞれの画像は透過PNG + +学習時のスクリプトのオプションに `--alpha_mask` を指定するか、dataset の設定ファイルの subset で、`alpha_mask` を指定してください。たとえば、以下のようになります。 ```toml [[datasets.subsets]] From fc85496f7e99b2bbbbd0246e0b0521780c55d859 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 27 May 2024 21:25:06 +0900 Subject: [PATCH 131/132] update docs for masked loss --- README.md | 8 +++++ docs/masked_loss_README-ja.md | 10 ++++++- docs/masked_loss_README.md | 56 +++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 docs/masked_loss_README.md diff --git a/README.md b/README.md index 23e049354..52c963392 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,10 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - Example: `--network_args "loraplus_unet_lr_ratio=16" "loraplus_text_encoder_lr_ratio=4"` or `--network_args "loraplus_lr_ratio=16" "loraplus_text_encoder_lr_ratio=4"` etc. - `network_module` `networks.lora` and `networks.dylora` are available. +- The feature to use the transparency (alpha channel) of the image as a mask in the loss calculation has been added. PR [#1223](https://github.com/kohya-ss/sd-scripts/pull/1223) Thanks to u-haru! + - The transparent part is ignored during training. Specify the `--alpha_mask` option in the training script or specify `alpha_mask = true` in the dataset configuration file. + - See [About masked loss](./docs/masked_loss_README.md) for details. + - LoRA training in SDXL now supports block-wise learning rates and block-wise dim (rank). PR [#1331](https://github.com/kohya-ss/sd-scripts/pull/1331) - Specify the learning rate and dim (rank) for each block. - See [Block-wise learning rates in LoRA](./docs/train_network_README-ja.md#階層別学習率) for details (Japanese only). @@ -214,6 +218,10 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) Thanks to frodo821! 
- 例:`--network_args "loraplus_unet_lr_ratio=16" "loraplus_text_encoder_lr_ratio=4"` または `--network_args "loraplus_lr_ratio=16" "loraplus_text_encoder_lr_ratio=4"` など - `network_module` の `networks.lora` および `networks.dylora` で使用可能です。 +- 画像の透明度(アルファチャネル)をロス計算時のマスクとして使用する機能が追加されました。PR [#1223](https://github.com/kohya-ss/sd-scripts/pull/1223) u-haru 氏に感謝します。 + - 透明部分が学習時に無視されるようになります。学習スクリプトに `--alpha_mask` オプションを指定するか、データセット設定ファイルに `alpha_mask = true` を指定してください。 + - 詳細は [マスクロスについて](./docs/masked_loss_README-ja.md) をご覧ください。 + - SDXL の LoRA で階層別学習率、階層別 dim (rank) をサポートしました。PR [#1331](https://github.com/kohya-ss/sd-scripts/pull/1331) - ブロックごとに学習率および dim (rank) を指定することができます。 - 詳細は [LoRA の階層別学習率](./docs/train_network_README-ja.md#階層別学習率) をご覧ください。 diff --git a/docs/masked_loss_README-ja.md b/docs/masked_loss_README-ja.md index 5377a5aff..58f042c3b 100644 --- a/docs/masked_loss_README-ja.md +++ b/docs/masked_loss_README-ja.md @@ -19,9 +19,17 @@ - マスク画像 ![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/53e9b0f8-a4bf-49ed-882d-4026f84e8450) +```.toml +[[datasets.subsets]] +image_dir = "/path/to/a_zundamon" +caption_extension = ".txt" +conditioning_data_dir = "/path/to/a_zundamon_mask" +num_repeats = 8 +``` + マスク画像は、学習画像と同じサイズで、学習する部分を白、無視する部分を黒で描画します。グレースケールにも対応しています(127 ならロス重みが 0.5 になります)。なお、正確にはマスク画像の R チャネルが用いられます。 -DreamBooth 方式の dataset で、`conditioning_data_dir` で指定したディレクトリにマスク画像を保存するしてください。ControlNet のデータセットと同じですので、詳細は [ControlNet-LLLite](train_lllite_README-ja.md#データセットの準備) を参照してください。 +DreamBooth 方式の dataset で、`conditioning_data_dir` で指定したディレクトリにマスク画像を保存してください。ControlNet のデータセットと同じですので、詳細は [ControlNet-LLLite](train_lllite_README-ja.md#データセットの準備) を参照してください。 ### 透明度(アルファチャネル)を使用する方法 diff --git a/docs/masked_loss_README.md b/docs/masked_loss_README.md new file mode 100644 index 000000000..3ac5ad211 --- /dev/null +++ b/docs/masked_loss_README.md @@ -0,0 +1,56 @@ +## Masked Loss + +Masked loss is a feature that allows you to train only part of an image by calculating the loss only for the part specified by the mask of the input image. For example, if you want to train a character, you can train only the character part by masking it, ignoring the background. + +There are two ways to specify the mask for masked loss. + +- Using a mask image +- Using transparency (alpha channel) of the image + +The sample uses the "AI image model training data" from [ZunZunPJ Illustration/3D Data](https://zunko.jp/con_illust.html). + +### Using a mask image + +This is a method of preparing a mask image corresponding to each training image. Prepare a mask image with the same file name as the training image and save it in a different directory from the training image. + +- Training image + ![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/607c5116-5f62-47de-8b66-9c4a597f0441) +- Mask image + ![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/53e9b0f8-a4bf-49ed-882d-4026f84e8450) + +```.toml +[[datasets.subsets]] +image_dir = "/path/to/a_zundamon" +caption_extension = ".txt" +conditioning_data_dir = "/path/to/a_zundamon_mask" +num_repeats = 8 +``` + +The mask image is the same size as the training image, with the part to be trained drawn in white and the part to be ignored in black. It also supports grayscale (127 gives a loss weight of 0.5). The R channel of the mask image is used currently. + +Use the dataset in the DreamBooth method, and save the mask image in the directory specified by `conditioning_data_dir`. 
It is the same as the ControlNet dataset, so please refer to [ControlNet-LLLite](train_lllite_README.md#Preparing-the-dataset) for details. + +### Using transparency (alpha channel) of the image + +The transparency (alpha channel) of the training image is used as a mask. The part with transparency 0 is ignored, the part with transparency 255 is trained. For semi-transparent parts, the loss weight changes according to the transparency (127 gives a weight of about 0.5). + +![image](https://github.com/kohya-ss/sd-scripts/assets/52813779/0baa129b-446a-4aac-b98c-7208efb0e75e) + +※Each image is a transparent PNG + +Specify `--alpha_mask` in the training script options or specify `alpha_mask` in the subset of the dataset configuration file. For example, it will look like this. + +```toml +[[datasets.subsets]] +image_dir = "/path/to/image/dir" +caption_extension = ".txt" +num_repeats = 8 +alpha_mask = true +``` + +## Notes on training + +- At the moment, only the dataset in the DreamBooth method is supported. +- The mask is applied after the size is reduced to 1/8, which is the size of the latents. Therefore, fine details (such as ahoge or earrings) may not be learned well. Some dilations of the mask may be necessary. +- If using masked loss, it may not be necessary to include parts that are not to be trained in the caption. (To be verified) +- In the case of `alpha_mask`, the latents cache is automatically regenerated when the enable/disable state of the mask is switched. From 645c974c4b4c18f9be7c703ad797dea19b2d1622 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Thu, 1 Aug 2024 00:38:48 +0800 Subject: [PATCH 132/132] Dev (#10) * Final implementation * Skip the final 1 step * fix alpha mask without disk cache closes #1351, ref #1339 * update for corner cases * Bump crate-ci/typos from 1.19.0 to 1.21.0, fix typos, and updated _typos.toml (Close #1307) * set static graph flag when DDP ref #1363 * make forward/backward pathes same ref #1363 * update README * add grad_hook after restore state closes #1344 * fix to work cache_latents/text_encoder_outputs * show file name if error in load_image ref #1385 --------- Co-authored-by: Kohya S Co-authored-by: Kohya S <52813779+kohya-ss@users.noreply.github.com> Co-authored-by: Yuta Hayashibe --- .github/workflows/typos.yml | 2 +- README.md | 18 ++++ _typos.toml | 2 + library/ipex/attention.py | 2 +- library/train_util.py | 48 +++++++---- networks/control_net_lllite_for_train.py | 12 +-- sdxl_train.py | 46 +++++----- sdxl_train_control_net_lllite.py | 3 + tools/cache_latents.py | 6 +- tools/cache_text_encoder_outputs.py | 5 +- train_network.py | 105 ++++++++++++++++++++++- 11 files changed, 196 insertions(+), 53 deletions(-) diff --git a/.github/workflows/typos.yml b/.github/workflows/typos.yml index e8b06483f..c81ff3210 100644 --- a/.github/workflows/typos.yml +++ b/.github/workflows/typos.yml @@ -18,4 +18,4 @@ jobs: - uses: actions/checkout@v4 - name: typos-action - uses: crate-ci/typos@v1.19.0 + uses: crate-ci/typos@v1.21.0 diff --git a/README.md b/README.md index 52c963392..946df58f3 100644 --- a/README.md +++ b/README.md @@ -178,6 +178,12 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - The ControlNet training script `train_controlnet.py` for SD1.5/2.x was not working, but it has been fixed. PR [#1284](https://github.com/kohya-ss/sd-scripts/pull/1284) Thanks to sdbds! 
+- `train_network.py` and `sdxl_train_network.py` now restore the order/position of data loading from DataSet when resuming training. PR [#1353](https://github.com/kohya-ss/sd-scripts/pull/1353) [#1359](https://github.com/kohya-ss/sd-scripts/pull/1359) Thanks to KohakuBlueleaf! + - This resolves the issue where the order of data loading from DataSet changes when resuming training. + - Specify the `--skip_until_initial_step` option to skip data loading until the specified step. If not specified, data loading starts from the beginning of the DataSet (same as before). + - If `--resume` is specified, the step saved in the state is used. + - Specify the `--initial_step` or `--initial_epoch` option to skip data loading until the specified step or epoch. Use these options in conjunction with `--skip_until_initial_step`. These options can be used without `--resume` (use them when resuming training with `--network_weights`). + - An option `--disable_mmap_load_safetensors` is added to disable memory mapping when loading the model's .safetensors in SDXL. PR [#1266](https://github.com/kohya-ss/sd-scripts/pull/1266) Thanks to Zovjsra! - It seems that the model file loading is faster in the WSL environment etc. - Available in `sdxl_train.py`, `sdxl_train_network.py`, `sdxl_train_textual_inversion.py`, and `sdxl_train_control_net_lllite.py`. @@ -235,6 +241,12 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) Thanks to frodo821! - SD1.5/2.x 用の ControlNet 学習スクリプト `train_controlnet.py` が動作しなくなっていたのが修正されました。PR [#1284](https://github.com/kohya-ss/sd-scripts/pull/1284) sdbds 氏に感謝します。 +- `train_network.py` および `sdxl_train_network.py` で、学習再開時に DataSet の読み込み順についても復元できるようになりました。PR [#1353](https://github.com/kohya-ss/sd-scripts/pull/1353) [#1359](https://github.com/kohya-ss/sd-scripts/pull/1359) KohakuBlueleaf 氏に感謝します。 + - これにより、学習再開時に DataSet の読み込み順が変わってしまう問題が解消されます。 + - `--skip_until_initial_step` オプションを指定すると、指定したステップまで DataSet 読み込みをスキップします。指定しない場合の動作は変わりません(DataSet の最初から読み込みます) + - `--resume` オプションを指定すると、state に保存されたステップ数が使用されます。 + - `--initial_step` または `--initial_epoch` オプションを指定すると、指定したステップまたはエポックまで DataSet 読み込みをスキップします。これらのオプションは `--skip_until_initial_step` と併用してください。またこれらのオプションは `--resume` と併用しなくても使えます(`--network_weights` を用いた学習再開時などにお使いください )。 + - SDXL でモデルの .safetensors を読み込む際にメモリマッピングを無効化するオプション `--disable_mmap_load_safetensors` が追加されました。PR [#1266](https://github.com/kohya-ss/sd-scripts/pull/1266) Zovjsra 氏に感謝します。 - WSL 環境等でモデルファイルの読み込みが高速化されるようです。 - `sdxl_train.py`、`sdxl_train_network.py`、`sdxl_train_textual_inversion.py`、`sdxl_train_control_net_lllite.py` で使用可能です。 @@ -253,6 +265,12 @@ https://github.com/kohya-ss/sd-scripts/pull/1290) frodo821 氏に感謝します - `gen_imgs.py` のプロンプトオプションに、保存時のファイル名を指定する `--f` オプションを追加しました。また同スクリプトで Diffusers ベースのキーを持つ LoRA の重みに対応しました。 +### Jun 23, 2024 / 2024-06-23: + +- Fixed `cache_latents.py` and `cache_text_encoder_outputs.py` not working. (Will be included in the next release.) + +- `cache_latents.py` および `cache_text_encoder_outputs.py` が動作しなくなっていたのを修正しました。(次回リリースに含まれます。) + ### Apr 7, 2024 / 2024-04-07: v0.8.7 - The default value of `huber_schedule` in Scheduled Huber Loss is changed from `exponential` to `snr`, which is expected to give better results. 
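As a rough illustration of how the resume options described in the notes above fit together, the starting step might be derived as follows. This is a minimal sketch with illustrative names, not the exact code in `train_network.py`; it assumes `args` carries the new options and that `steps_from_state` is the value restored from `train_state.json` by `--resume` (or `None` if there is none).

```python
import math

def resolve_initial_step(args, steps_from_state, train_dataloader, num_processes):
    # steps per epoch, accounting for multi-GPU and gradient accumulation
    steps_per_epoch = math.ceil(
        len(train_dataloader) / num_processes / args.gradient_accumulation_steps
    )
    if args.initial_step is not None:
        return args.initial_step  # an explicit step wins over everything else
    if args.initial_epoch is not None:
        return (args.initial_epoch - 1) * steps_per_epoch  # epoch 1 = no skip
    return steps_from_state or 0  # fall back to the step saved in the state
```

With `steps_per_epoch = 100`, for example, `--initial_epoch 3` skips 200 steps, exactly as if `--initial_step 200` had been given.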
diff --git a/_typos.toml b/_typos.toml index ae9e06b18..bbf7728f4 100644 --- a/_typos.toml +++ b/_typos.toml @@ -2,6 +2,7 @@ # Instruction: https://github.com/marketplace/actions/typos-action#getting-started [default.extend-identifiers] +ddPn08="ddPn08" [default.extend-words] NIN="NIN" @@ -27,6 +28,7 @@ rik="rik" koo="koo" yos="yos" wn="wn" +hime="hime" [files] diff --git a/library/ipex/attention.py b/library/ipex/attention.py index d989ad53d..2bc62f65c 100644 --- a/library/ipex/attention.py +++ b/library/ipex/attention.py @@ -5,7 +5,7 @@ # pylint: disable=protected-access, missing-function-docstring, line-too-long -# ARC GPUs can't allocate more than 4GB to a single block so we slice the attetion layers +# ARC GPUs can't allocate more than 4GB to a single block so we slice the attention layers sdpa_slice_trigger_rate = float(os.environ.get('IPEX_SDPA_SLICE_TRIGGER_RATE', 4)) attention_slice_rate = float(os.environ.get('IPEX_ATTENTION_SLICE_RATE', 4)) diff --git a/library/train_util.py b/library/train_util.py index a084f53e2..3b990bc0a 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -657,8 +657,16 @@ def set_caching_mode(self, mode): def set_current_epoch(self, epoch): if not self.current_epoch == epoch: # epochが切り替わったらバケツをシャッフルする - self.shuffle_buckets() - self.current_epoch = epoch + if epoch > self.current_epoch: + logger.info("epoch is incremented. current_epoch: {}, epoch: {}".format(self.current_epoch, epoch)) + num_epochs = epoch - self.current_epoch + for _ in range(num_epochs): + self.current_epoch += 1 + self.shuffle_buckets() + # self.current_epoch seem to be set to 0 again in the next epoch. it may be caused by skipped_dataloader? + else: + logger.warning("epoch is not incremented. current_epoch: {}, epoch: {}".format(self.current_epoch, epoch)) + self.current_epoch = epoch def set_current_step(self, step): self.current_step = step @@ -1265,7 +1273,8 @@ def __getitem__(self, index): if subset.alpha_mask: if img.shape[2] == 4: alpha_mask = img[:, :, 3] # [H,W] - alpha_mask = transforms.ToTensor()(alpha_mask) # 0-255 -> 0-1 + alpha_mask = alpha_mask.astype(np.float32) / 255.0 # 0.0~1.0 + alpha_mask = torch.FloatTensor(alpha_mask) else: alpha_mask = torch.ones((img.shape[0], img.shape[1]), dtype=torch.float32) else: @@ -2211,7 +2220,7 @@ def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool, alph # 戻り値は、latents_tensor, (original_size width, original_size height), (crop left, crop top) def load_latents_from_disk( npz_path, -) -> Tuple[Optional[torch.Tensor], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: +) -> Tuple[Optional[np.ndarray], Optional[List[int]], Optional[List[int]], Optional[np.ndarray], Optional[np.ndarray]]: npz = np.load(npz_path) if "latents" not in npz: raise ValueError(f"error: npz is old format. 
please re-generate {npz_path}") @@ -2229,7 +2238,7 @@ def save_latents_to_disk(npz_path, latents_tensor, original_size, crop_ltrb, fli if flipped_latents_tensor is not None: kwargs["latents_flipped"] = flipped_latents_tensor.float().cpu().numpy() if alpha_mask is not None: - kwargs["alpha_mask"] = alpha_mask # ndarray + kwargs["alpha_mask"] = alpha_mask.float().cpu().numpy() np.savez( npz_path, latents=latents_tensor.float().cpu().numpy(), @@ -2425,16 +2434,20 @@ def load_arbitrary_dataset(args, tokenizer) -> MinimalDataset: return train_dataset_group -def load_image(image_path, alpha=False): - image = Image.open(image_path) - if alpha: - if not image.mode == "RGBA": - image = image.convert("RGBA") - else: - if not image.mode == "RGB": - image = image.convert("RGB") - img = np.array(image, np.uint8) - return img +def load_image(image_path, alpha=False): + try: + with Image.open(image_path) as image: + if alpha: + if not image.mode == "RGBA": + image = image.convert("RGBA") + else: + if not image.mode == "RGB": + image = image.convert("RGB") + img = np.array(image, np.uint8) + return img + except (IOError, OSError) as e: + logger.error(f"Error loading file: {image_path}") + raise e # 画像を読み込む。戻り値はnumpy.ndarray,(original width, original height),(crop left, crop top, crop right, crop bottom) @@ -2496,8 +2509,9 @@ def cache_batch_latents( if image.shape[2] == 4: alpha_mask = image[:, :, 3] # [H,W] alpha_mask = alpha_mask.astype(np.float32) / 255.0 + alpha_mask = torch.FloatTensor(alpha_mask) # [H,W] else: - alpha_mask = np.ones_like(image[:, :, 0], dtype=np.float32) + alpha_mask = torch.ones_like(image[:, :, 0], dtype=torch.float32) # [H,W] else: alpha_mask = None alpha_masks.append(alpha_mask) @@ -5554,6 +5568,8 @@ def add(self, *, epoch: int, step: int, loss: float) -> None: if epoch == 0: self.loss_list.append(loss) else: + while len(self.loss_list) <= step: + self.loss_list.append(0.0) self.loss_total -= self.loss_list[step] self.loss_list[step] = loss self.loss_total += loss diff --git a/networks/control_net_lllite_for_train.py b/networks/control_net_lllite_for_train.py index 65b3520cf..366451b7f 100644 --- a/networks/control_net_lllite_for_train.py +++ b/networks/control_net_lllite_for_train.py @@ -7,8 +7,10 @@ import torch from library import sdxl_original_unet from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) # input_blocksに適用するかどうか / if True, input_blocks are not applied @@ -103,19 +105,15 @@ def set_lllite(self, depth, cond_emb_dim, name, mlp_dim, dropout=None, multiplie add_lllite_modules(self, in_dim, depth, cond_emb_dim, mlp_dim) self.cond_image = None - self.cond_emb = None def set_cond_image(self, cond_image): self.cond_image = cond_image - self.cond_emb = None def forward(self, x): if not self.enabled: return super().forward(x) - if self.cond_emb is None: - self.cond_emb = self.lllite_conditioning1(self.cond_image) - cx = self.cond_emb + cx = self.lllite_conditioning1(self.cond_image) # make forward and backward compatible # reshape / b,c,h,w -> b,h*w,c n, c, h, w = cx.shape @@ -159,9 +157,7 @@ def forward(self, x): # , cond_image=None): if not self.enabled: return super().forward(x) - if self.cond_emb is None: - self.cond_emb = self.lllite_conditioning1(self.cond_image) - cx = self.cond_emb + cx = self.lllite_conditioning1(self.cond_image) cx = torch.cat([cx, self.down(x)], dim=1) cx = self.mid(cx) diff --git a/sdxl_train.py b/sdxl_train.py index 9e20c60ca..ae92d6a3d 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ 
-481,6 +481,26 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder2 = accelerator.prepare(text_encoder2) optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + # TextEncoderの出力をキャッシュするときにはCPUへ移動する + if args.cache_text_encoder_outputs: + # move Text Encoders for sampling images. Text Encoder doesn't work on CPU with fp16 + text_encoder1.to("cpu", dtype=torch.float32) + text_encoder2.to("cpu", dtype=torch.float32) + clean_memory_on_device(accelerator.device) + else: + # make sure Text Encoders are on GPU + text_encoder1.to(accelerator.device) + text_encoder2.to(accelerator.device) + + # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする + if args.full_fp16: + # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. + # -> But we think it's ok to patch accelerator even if deepspeed is enabled. + train_util.patch_accelerator_for_fp16_training(accelerator) + + # resumeする + train_util.resume_from_local_or_hf_if_specified(accelerator, args) + if args.fused_backward_pass: # use fused optimizer for backward pass: other optimizers will be supported in the future import library.adafactor_fused @@ -532,26 +552,6 @@ def optimizer_hook(parameter: torch.Tensor): parameter_optimizer_map[parameter] = opt_idx num_parameters_per_group[opt_idx] += 1 - # TextEncoderの出力をキャッシュするときにはCPUへ移動する - if args.cache_text_encoder_outputs: - # move Text Encoders for sampling images. Text Encoder doesn't work on CPU with fp16 - text_encoder1.to("cpu", dtype=torch.float32) - text_encoder2.to("cpu", dtype=torch.float32) - clean_memory_on_device(accelerator.device) - else: - # make sure Text Encoders are on GPU - text_encoder1.to(accelerator.device) - text_encoder2.to(accelerator.device) - - # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする - if args.full_fp16: - # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. - # -> But we think it's ok to patch accelerator even if deepspeed is enabled. 
- train_util.patch_accelerator_for_fp16_training(accelerator) - - # resumeする - train_util.resume_from_local_or_hf_if_specified(accelerator, args) - # epoch数を計算する num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) @@ -589,7 +589,11 @@ def optimizer_hook(parameter: torch.Tensor): init_kwargs["wandb"] = {"name": args.wandb_run_name} if args.log_tracker_config is not None: init_kwargs = toml.load(args.log_tracker_config) - accelerator.init_trackers("finetuning" if args.log_tracker_name is None else args.log_tracker_name, config=train_util.get_sanitized_config_or_none(args), init_kwargs=init_kwargs) + accelerator.init_trackers( + "finetuning" if args.log_tracker_name is None else args.log_tracker_name, + config=train_util.get_sanitized_config_or_none(args), + init_kwargs=init_kwargs, + ) # For --sample_at_first sdxl_train_util.sample_images( diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 301310901..5ff060a9f 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -289,6 +289,9 @@ def train(args): # acceleratorがなんかよろしくやってくれるらしい unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) + if isinstance(unet, DDP): + unet._set_static_graph() # avoid error for multiple use of the parameter + if args.gradient_checkpointing: unet.train() # according to TI example in Diffusers, train is required -> これオリジナルのU-Netしたので本当は外せる else: diff --git a/tools/cache_latents.py b/tools/cache_latents.py index b7c88121e..2f0098b42 100644 --- a/tools/cache_latents.py +++ b/tools/cache_latents.py @@ -16,8 +16,7 @@ ConfigSanitizer, BlueprintGenerator, ) -from library.utils import setup_logging - +from library.utils import setup_logging, add_logging_arguments setup_logging() import logging @@ -25,6 +24,7 @@ def cache_to_disk(args: argparse.Namespace) -> None: + setup_logging(args, reset=True) train_util.prepare_dataset_args(args, True) # check cache latents arg @@ -97,6 +97,7 @@ def cache_to_disk(args: argparse.Namespace) -> None: # acceleratorを準備する logger.info("prepare accelerator") + args.deepspeed = False accelerator = train_util.prepare_accelerator(args) # mixed precisionに対応した型を用意しておき適宜castする @@ -176,6 +177,7 @@ def cache_to_disk(args: argparse.Namespace) -> None: def setup_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() + add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) train_util.add_training_arguments(parser, True) train_util.add_dataset_arguments(parser, True, True, True) diff --git a/tools/cache_text_encoder_outputs.py b/tools/cache_text_encoder_outputs.py index 5f1d6d201..a75d9da74 100644 --- a/tools/cache_text_encoder_outputs.py +++ b/tools/cache_text_encoder_outputs.py @@ -16,12 +16,13 @@ ConfigSanitizer, BlueprintGenerator, ) -from library.utils import setup_logging +from library.utils import setup_logging, add_logging_arguments setup_logging() import logging logger = logging.getLogger(__name__) def cache_to_disk(args: argparse.Namespace) -> None: + setup_logging(args, reset=True) train_util.prepare_dataset_args(args, True) # check cache arg @@ -99,6 +100,7 @@ def cache_to_disk(args: argparse.Namespace) -> None: # acceleratorを準備する logger.info("prepare accelerator") + args.deepspeed = False accelerator = train_util.prepare_accelerator(args) # mixed precisionに対応した型を用意しておき適宜castする @@ -171,6 +173,7 @@ def 
cache_to_disk(args: argparse.Namespace) -> None: def setup_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() + add_logging_arguments(parser) train_util.add_sd_models_arguments(parser) train_util.add_training_arguments(parser, True) train_util.add_dataset_arguments(parser, True, True, True) diff --git a/train_network.py b/train_network.py index e575ea36c..13e2d3ff8 100644 --- a/train_network.py +++ b/train_network.py @@ -507,6 +507,15 @@ def save_model_hook(models, weights, output_dir): weights.pop(i) # print(f"save model hook: {len(weights)} weights will be saved") + # save current ecpoch and step + train_state_file = os.path.join(output_dir, "train_state.json") + # +1 is needed because the state is saved before current_step is set from global_step + logger.info(f"save train state to {train_state_file} at epoch {current_epoch.value} step {current_step.value+1}") + with open(train_state_file, "w", encoding="utf-8") as f: + json.dump({"current_epoch": current_epoch.value, "current_step": current_step.value + 1}, f) + + steps_from_state = None + def load_model_hook(models, input_dir): # remove models except network remove_indices = [] @@ -517,6 +526,15 @@ def load_model_hook(models, input_dir): models.pop(i) # print(f"load model hook: {len(models)} models will be loaded") + # load current epoch and step to + nonlocal steps_from_state + train_state_file = os.path.join(input_dir, "train_state.json") + if os.path.exists(train_state_file): + with open(train_state_file, "r", encoding="utf-8") as f: + data = json.load(f) + steps_from_state = data["current_step"] + logger.info(f"load train state from {train_state_file}: {data}") + accelerator.register_save_state_pre_hook(save_model_hook) accelerator.register_load_state_pre_hook(load_model_hook) @@ -760,7 +778,54 @@ def load_model_hook(models, input_dir): if key in metadata: minimum_metadata[key] = metadata[key] - progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps") + # calculate steps to skip when resuming or starting from a specific step + initial_step = 0 + if args.initial_epoch is not None or args.initial_step is not None: + # if initial_epoch or initial_step is specified, steps_from_state is ignored even when resuming + if steps_from_state is not None: + logger.warning( + "steps from the state is ignored because initial_step is specified / initial_stepが指定されているため、stateからのステップ数は無視されます" + ) + if args.initial_step is not None: + initial_step = args.initial_step + else: + # num steps per epoch is calculated by num_processes and gradient_accumulation_steps + initial_step = (args.initial_epoch - 1) * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + else: + # if initial_epoch and initial_step are not specified, steps_from_state is used when resuming + if steps_from_state is not None: + initial_step = steps_from_state + steps_from_state = None + + if initial_step > 0: + assert ( + args.max_train_steps > initial_step + ), f"max_train_steps should be greater than initial step / max_train_stepsは初期ステップより大きい必要があります: {args.max_train_steps} vs {initial_step}" + + progress_bar = tqdm( + range(args.max_train_steps - initial_step), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps" + ) + + epoch_to_start = 0 + if initial_step > 0: + if args.skip_until_initial_step: + # if skip_until_initial_step is specified, load data and discard it to ensure the same data is used + if not args.resume: + 
logger.info( + f"initial_step is specified but not resuming. lr scheduler will be started from the beginning / initial_stepが指定されていますがresumeしていないため、lr schedulerは最初から始まります" + ) + logger.info(f"skipping {initial_step} steps / {initial_step}ステップをスキップします") + initial_step *= args.gradient_accumulation_steps + + # set epoch to start to make initial_step less than len(train_dataloader) + epoch_to_start = initial_step // math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + else: + # if not, only epoch no is skipped for informative purpose + epoch_to_start = initial_step // math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + initial_step = 0 # do not skip + global_step = 0 noise_scheduler = DDPMScheduler( @@ -819,7 +884,13 @@ def remove_model(old_ckpt_name): self.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet) # training loop - for epoch in range(num_train_epochs): + if initial_step > 0: # only if skip_until_initial_step is specified + for skip_epoch in range(epoch_to_start): # skip epochs + logger.info(f"skipping epoch {skip_epoch+1} because initial_step (multiplied) is {initial_step}") + initial_step -= len(train_dataloader) + global_step = initial_step + + for epoch in range(epoch_to_start, num_train_epochs): accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 @@ -827,8 +898,17 @@ def remove_model(old_ckpt_name): accelerator.unwrap_model(network).on_epoch_start(text_encoder, unet) - for step, batch in enumerate(train_dataloader): + skipped_dataloader = None + if initial_step > 0: + skipped_dataloader = accelerator.skip_first_batches(train_dataloader, initial_step - 1) + initial_step = 1 + + for step, batch in enumerate(skipped_dataloader or train_dataloader): current_step.value = global_step + if initial_step > 0: + initial_step -= 1 + continue + with accelerator.accumulate(training_model): on_step_start(text_encoder, unet) @@ -1129,6 +1209,25 @@ def setup_parser() -> argparse.ArgumentParser: action="store_true", help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", ) + parser.add_argument( + "--skip_until_initial_step", + action="store_true", + help="skip training until initial_step is reached / initial_stepに到達するまで学習をスキップする", + ) + parser.add_argument( + "--initial_epoch", + type=int, + default=None, + help="initial epoch number, 1 means first epoch (same as not specifying). NOTE: initial_epoch/step doesn't affect to lr scheduler. Which means lr scheduler will start from 0 without `--resume`." + + " / 初期エポック数、1で最初のエポック(未指定時と同じ)。注意:initial_epoch/stepはlr schedulerに影響しないため、`--resume`しない場合はlr schedulerは0から始まる", + ) + parser.add_argument( + "--initial_step", + type=int, + default=None, + help="initial step number including all epochs, 0 means first step (same as not specifying). overwrites initial_epoch." + + " / 初期ステップ数、全エポックを含むステップ数、0で最初のステップ(未指定時と同じ)。initial_epochを上書きする", + ) # parser.add_argument("--loraplus_lr_ratio", default=None, type=float, help="LoRA+ learning rate ratio") # parser.add_argument("--loraplus_unet_lr_ratio", default=None, type=float, help="LoRA+ UNet learning rate ratio") # parser.add_argument("--loraplus_text_encoder_lr_ratio", default=None, type=float, help="LoRA+ text encoder learning rate ratio")
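Taken together, the options above drive the fast-forwarding added to the training loop in this patch. Below is a condensed sketch of that behavior, assuming a dataloader prepared by `accelerate`; the real loop also multiplies `initial_step` by `gradient_accumulation_steps` and handles an off-by-one around the first batch, both omitted here.

```python
from accelerate import Accelerator

def fast_forward(accelerator: Accelerator, train_dataloader, initial_step: int):
    """Return (first epoch to run, dataloader positioned at initial_step)."""
    steps_per_epoch = len(train_dataloader)
    epoch_to_start, batches_into_epoch = divmod(initial_step, steps_per_epoch)
    # the skipped batches are consumed without running the model, so the data
    # order matches an uninterrupted run; use the returned loader only for the
    # first resumed epoch, then switch back to train_dataloader
    loader = accelerator.skip_first_batches(train_dataloader, batches_into_epoch)
    return epoch_to_start, loader
```

In practice, `--resume <state_dir> --skip_until_initial_step` restores the step recorded in `train_state.json`, while `--network_weights <file> --skip_until_initial_step --initial_step N` gives the same fast-forward without a saved state (the lr scheduler then restarts from zero, as the help text notes).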
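The `train_state.json` round-trip that feeds `steps_from_state` (added by the save/load hooks earlier in this patch) is small enough to sketch in full; the file name and keys follow the hunks above, while the function names here are illustrative.

```python
import json
import os

def save_train_state(output_dir: str, current_epoch: int, current_step: int) -> None:
    # +1 because the save hook runs before current_step is set from global_step
    state = {"current_epoch": current_epoch, "current_step": current_step + 1}
    with open(os.path.join(output_dir, "train_state.json"), "w", encoding="utf-8") as f:
        json.dump(state, f)

def load_steps_from_state(input_dir: str):
    path = os.path.join(input_dir, "train_state.json")
    if not os.path.exists(path):
        return None  # no train_state.json: fall back to the explicit options
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)["current_step"]
```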