-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DC-AE] Add the official Deep Compression Autoencoder code (32x, 64x, 128x compression ratios) #9708
Changes from 1 commit
6e616a9
d2e187a
90e8939
825c975
3a44fa4
55b2615
6fb7fdb
c323e76
da7caa5
fb6d92a
5e63a1a
72cce2b
8f9b4e4
b7f68f9
6d96b95
3c3cc51
1448681
bf40fe8
dd7718a
19986a5
3481e23
0e818df
c6eb233
59de0a3
ea604a4
80dce02
1752afd
883bcf4
25ae389
96e844b
59b6e25
7ce9ff2
30d6308
cab56b1
b42bb54
2e04a99
b4f75f2
c82f828
22ea5fd
4f5cbb4
2f6bbad
4495783
4d3c026
e007057
d3d9c84
be9826c
20da201
5ed50e9
2d59056
c1c02a2
1f8a3b3
7b9d7e5
bf6c211
a2ec5f8
f5876c5
44034a6
6379241
77571a8
c4d0867
0bdb7ef
54e933b
babc9f5
3d5faaf
65edfa5
ca3ac4d
9ef7b59
074817c
64de66a
0bda5c5
eb64d52
4a224ce
30c3238
39a947c
68f817a
da834d5
632ad3b
d6c748c
46eb504
31f9fc6
6f29e2a
b6e8fba
f862bae
f9fce24
e594745
3c0b1ca
91057d4
67aa715
eda66e1
e3d33e6
cc97502
2b370df
94355ab
a191f07
116c049
b6e0aba
ec4e84f
dbae8f1
042c2a0
f2525b9
d3d224c
6122b84
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,12 @@ | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
"""Conversion script for the Stable Diffusion checkpoints.""" | ||
|
||
""" | ||
Conversion scripts for the various modeling checkpoints. These scripts convert original model implementations to | ||
Diffusers adapted versions. This usually only involves renaming/remapping the state dict keys and changing some | ||
modeling components partially (for example, splitting a single QKV linear to individual Q, K, V layers). | ||
""" | ||
|
||
import copy | ||
import os | ||
|
@@ -92,6 +97,7 @@ | |
"double_blocks.0.img_attn.norm.key_norm.scale", | ||
"model.diffusion_model.double_blocks.0.img_attn.norm.key_norm.scale", | ||
], | ||
"autoencoder_dc": "decoder.stages.0.op_list.0.main.conv.conv.weight", | ||
} | ||
|
||
DIFFUSERS_DEFAULT_PIPELINE_PATHS = { | ||
|
@@ -2198,3 +2204,251 @@ def swap_scale_shift(weight): | |
) | ||
|
||
return converted_state_dict | ||
|
||
|
||
def create_autoencoder_dc_config_from_original(original_config, checkpoint, **kwargs):
    """Build the diffusers `AutoencoderDC` config for a known DC-AE checkpoint.

    Several official DC-AE checkpoints share the exact same tensor structure and
    differ only in `scaling_factor`, so the variant cannot be inferred from the
    state dict alone; it is selected via the `model_name` entry of
    `original_config` instead (see the review discussion on this PR).

    Args:
        original_config: Mapping that may contain a `model_name` entry naming the
            official DC-AE variant; defaults to `"dc-ae-f32c32-sana-1.0"`.
        checkpoint: Original state dict. Unused here, but kept so this helper has
            the same signature as the other `create_*_config_from_original` helpers.
        kwargs: Ignored; accepted for interface compatibility.

    Returns:
        A dict of `AutoencoderDC` keyword arguments, plus a `model_name` entry
        that `convert_autoencoder_dc_checkpoint_to_diffusers` pops later.

    Raises:
        ValueError: If `model_name` is not a known DC-AE variant. (Previously an
            unknown name crashed with an opaque `NameError` on `config`.)
    """
    model_name = original_config.get("model_name", "dc-ae-f32c32-sana-1.0")

    if model_name == "dc-ae-f32c32-sana-1.0":
        config = {
            "latent_channels": 32,
            "encoder_block_types": (
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ),
            "decoder_block_types": (
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ),
            "encoder_block_out_channels": (128, 256, 512, 512, 1024, 1024),
            "decoder_block_out_channels": (128, 256, 512, 512, 1024, 1024),
            "encoder_qkv_multiscales": ((), (), (), (5,), (5,), (5,)),
            "decoder_qkv_multiscales": ((), (), (), (5,), (5,), (5,)),
            "encoder_layers_per_block": (2, 2, 2, 3, 3, 3),
            "decoder_layers_per_block": [3, 3, 3, 3, 3, 3],
            "downsample_block_type": "conv",
            "upsample_block_type": "interpolate",
            "decoder_norm_types": "rms_norm",
            "decoder_act_fns": "silu",
            "scaling_factor": 0.41407,
        }
    elif model_name in ["dc-ae-f32c32-in-1.0", "dc-ae-f32c32-mix-1.0"]:
        config = {
            "latent_channels": 32,
            "encoder_block_types": [
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ],
            "decoder_block_types": [
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ],
            "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024],
            "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024],
            "encoder_layers_per_block": [0, 4, 8, 2, 2, 2],
            "decoder_layers_per_block": [0, 5, 10, 2, 2, 2],
            "encoder_qkv_multiscales": ((), (), (), (), (), ()),
            "decoder_qkv_multiscales": ((), (), (), (), (), ()),
            "decoder_norm_types": ["batch_norm", "batch_norm", "batch_norm", "rms_norm", "rms_norm", "rms_norm"],
            "decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu"],
        }
        # Same architecture; only the latent scaling factor differs per variant.
        if model_name == "dc-ae-f32c32-in-1.0":
            config["scaling_factor"] = 0.3189
        elif model_name == "dc-ae-f32c32-mix-1.0":
            config["scaling_factor"] = 0.4552
    elif model_name in ["dc-ae-f64c128-in-1.0", "dc-ae-f64c128-mix-1.0"]:
        config = {
            "latent_channels": 128,
            "encoder_block_types": [
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ],
            "decoder_block_types": [
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ],
            "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048],
            "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048],
            "encoder_layers_per_block": [0, 4, 8, 2, 2, 2, 2],
            "decoder_layers_per_block": [0, 5, 10, 2, 2, 2, 2],
            "encoder_qkv_multiscales": ((), (), (), (), (), (), ()),
            "decoder_qkv_multiscales": ((), (), (), (), (), (), ()),
            "decoder_norm_types": [
                "batch_norm",
                "batch_norm",
                "batch_norm",
                "rms_norm",
                "rms_norm",
                "rms_norm",
                "rms_norm",
            ],
            "decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu", "silu"],
        }
        if model_name == "dc-ae-f64c128-in-1.0":
            config["scaling_factor"] = 0.2889
        elif model_name == "dc-ae-f64c128-mix-1.0":
            config["scaling_factor"] = 0.4538
    elif model_name in ["dc-ae-f128c512-in-1.0", "dc-ae-f128c512-mix-1.0"]:
        config = {
            "latent_channels": 512,
            "encoder_block_types": [
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ],
            "decoder_block_types": [
                "ResBlock",
                "ResBlock",
                "ResBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
                "EfficientViTBlock",
            ],
            "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048, 2048],
            "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048, 2048],
            "encoder_layers_per_block": [0, 4, 8, 2, 2, 2, 2, 2],
            "decoder_layers_per_block": [0, 5, 10, 2, 2, 2, 2, 2],
            "encoder_qkv_multiscales": ((), (), (), (), (), (), (), ()),
            "decoder_qkv_multiscales": ((), (), (), (), (), (), (), ()),
            "decoder_norm_types": [
                "batch_norm",
                "batch_norm",
                "batch_norm",
                "rms_norm",
                "rms_norm",
                "rms_norm",
                "rms_norm",
                "rms_norm",
            ],
            "decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu", "silu", "silu"],
        }
        if model_name == "dc-ae-f128c512-in-1.0":
            config["scaling_factor"] = 0.4883
        elif model_name == "dc-ae-f128c512-mix-1.0":
            config["scaling_factor"] = 0.3620
    else:
        raise ValueError(f"Unknown DC-AE model name: {model_name!r}")

    # Record the resolved variant; the checkpoint converter pops this entry.
    config["model_name"] = model_name

    return config
|
||
|
||
def convert_autoencoder_dc_checkpoint_to_diffusers(config, checkpoint, **kwargs):
    """Remap an original DC-AE state dict to the diffusers `AutoencoderDC` layout.

    Args:
        config: Config dict produced by `create_autoencoder_dc_config_from_original`;
            its `model_name` entry is popped here to select variant-specific renames.
        checkpoint: Original state dict. It is drained in place (every entry is
            popped) and the converted tensors are returned in a new dict.
        kwargs: Ignored; accepted for interface compatibility.

    Returns:
        A new state dict with diffusers-style keys.
    """
    # Drain the incoming checkpoint so the caller's dict ends up empty, matching
    # the behavior of the other conversion helpers in this file.
    converted_state_dict = {key: checkpoint.pop(key) for key in list(checkpoint.keys())}
    model_name = config.pop("model_name")

    def remap_qkv_(key: str, state_dict):
        # The original model fuses Q/K/V into a single 1x1 conv; diffusers uses
        # separate linear projections, so split along the output-channel dim and
        # squeeze away the trailing 1x1 spatial dims.
        fused_qkv = state_dict.pop(key)
        to_q, to_k, to_v = torch.chunk(fused_qkv, 3, dim=0)
        parent_module, _, _ = key.rpartition(".qkv.conv.weight")
        state_dict[f"{parent_module}.to_q.weight"] = to_q.squeeze()
        state_dict[f"{parent_module}.to_k.weight"] = to_k.squeeze()
        state_dict[f"{parent_module}.to_v.weight"] = to_v.squeeze()

    def remap_proj_conv_(key: str, state_dict):
        # 1x1 output-projection conv becomes a linear `to_out` layer.
        parent_module, _, _ = key.rpartition(".proj.conv.weight")
        state_dict[f"{parent_module}.to_out.weight"] = state_dict.pop(key).squeeze()

    AE_KEYS_RENAME_DICT = {
        # common
        "main.": "",
        "op_list.": "",
        "context_module": "attn",
        "local_module": "conv_out",
        # NOTE: The below two lines work because scales in the available configs only have a tuple length of 1
        # If there were more scales, there would be more layers, so a loop would be better to handle this
        "aggreg.0.0": "to_qkv_multiscale.0.proj_in",
        "aggreg.0.1": "to_qkv_multiscale.0.proj_out",
        "depth_conv.conv": "conv_depth",
        "inverted_conv.conv": "conv_inverted",
        "point_conv.conv": "conv_point",
        "point_conv.norm": "norm",
        "conv.conv.": "conv.",
        "conv1.conv": "conv1",
        "conv2.conv": "conv2",
        "conv2.norm": "norm",
        "proj.norm": "norm_out",
        # encoder
        "encoder.project_in.conv": "encoder.conv_in",
        "encoder.project_out.0.conv": "encoder.conv_out",
        "encoder.stages": "encoder.down_blocks",
        # decoder
        "decoder.project_in.conv": "decoder.conv_in",
        "decoder.project_out.0": "decoder.norm_out",
        "decoder.project_out.2.conv": "decoder.conv_out",
        "decoder.stages": "decoder.up_blocks",
    }

    # All non-SANA variants (f32c32/f64c128/f128c512) keep an extra `.conv` level
    # on the in/out projections. The original code carried three byte-identical
    # per-variant dicts (AE_F32C32_KEYS / AE_F64C128_KEYS / AE_F128C512_KEYS);
    # they are consolidated here.
    AE_NON_SANA_KEYS = {
        "encoder.project_in.conv": "encoder.conv_in.conv",
        "decoder.project_out.2.conv": "decoder.conv_out.conv",
    }

    AE_SPECIAL_KEYS_REMAP = {
        "qkv.conv.weight": remap_qkv_,
        "proj.conv.weight": remap_proj_conv_,
    }

    if (
        ("f32c32" in model_name and "sana" not in model_name)
        or "f64c128" in model_name
        or "f128c512" in model_name
    ):
        AE_KEYS_RENAME_DICT.update(AE_NON_SANA_KEYS)

    # First pass: plain substring renames, applied in dict-insertion order.
    for key in list(converted_state_dict.keys()):
        new_key = key
        for replace_key, rename_key in AE_KEYS_RENAME_DICT.items():
            new_key = new_key.replace(replace_key, rename_key)
        converted_state_dict[new_key] = converted_state_dict.pop(key)

    # Second pass: structural remaps (fused-QKV split, projection squeeze).
    for key in list(converted_state_dict.keys()):
        for special_key, handler_fn_inplace in AE_SPECIAL_KEYS_REMAP.items():
            if special_key not in key:
                continue
            handler_fn_inplace(key, converted_state_dict)

    return converted_state_dict
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We would need to infer the model repo type using this key right? That still has to be added.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh sorry, missed it. Adding now, but not sure how this worked before then 🤔