Commit
Merge branch 'master' into execution_model_inversion
guill committed Aug 2, 2024
2 parents c4666bf + 369f459 commit 887ceb3
Showing 66 changed files with 49,404 additions and 218 deletions.
11 changes: 9 additions & 2 deletions .ci/update_windows/update.py
@@ -62,8 +62,15 @@ def pull(repo, remote_name='origin', branch='master'):
 
 print("checking out master branch")
 branch = repo.lookup_branch('master')
-ref = repo.lookup_reference(branch.name)
-repo.checkout(ref)
+if branch is None:
+    ref = repo.lookup_reference('refs/remotes/origin/master')
+    repo.checkout(ref)
+    branch = repo.lookup_branch('master')
+    if branch is None:
+        repo.create_branch('master', repo.get(ref.target))
+else:
+    ref = repo.lookup_reference(branch.name)
+    repo.checkout(ref)
 
 print("pulling latest changes")
 pull(repo)
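
The added branch handling covers clones where no local master branch exists: the script now checks out origin's master and creates a local branch from it. A minimal standalone sketch of the same logic, assuming a pygit2 repository at an example path:

    import pygit2

    # Sketch of the fallback added above; the repository path is an example.
    repo = pygit2.Repository("ComfyUI")
    branch = repo.lookup_branch('master')
    if branch is None:
        # No local master: check out the remote-tracking ref, then create
        # a local master pointing at the same commit.
        ref = repo.lookup_reference('refs/remotes/origin/master')
        repo.checkout(ref)
        if repo.lookup_branch('master') is None:
            repo.create_branch('master', repo.get(ref.target))
    else:
        repo.checkout(repo.lookup_reference(branch.name))
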
3 changes: 2 additions & 1 deletion .github/workflows/stable-release.yml
@@ -106,4 +106,5 @@ jobs:
           file: ComfyUI_windows_portable_nvidia.7z
           tag: ${{ github.ref }}
           overwrite: true
-
+          prerelease: true
+          make_latest: false
11 changes: 8 additions & 3 deletions .github/workflows/windows_release_dependencies.yml
@@ -8,11 +8,16 @@ on:
         required: false
         type: string
         default: ""
+      extra_dependencies:
+        description: 'extra dependencies'
+        required: false
+        type: string
+        default: "\"numpy<2\""
       cu:
         description: 'cuda version'
         required: true
         type: string
-        default: "121"
+        default: "124"
 
       python_minor:
         description: 'python minor version'
@@ -24,7 +29,7 @@ on:
         description: 'python patch version'
         required: true
         type: string
-        default: "8"
+        default: "9"
 # push:
 #   branches:
 #     - master
@@ -51,7 +56,7 @@ jobs:
         ..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2
         pause" > update_comfyui_and_python_dependencies.bat
-        python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
+        python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
         python -m pip install --no-cache-dir ./temp_wheel_dir/*
         echo installed basic
         ls -lah temp_wheel_dir
4 changes: 2 additions & 2 deletions .github/workflows/windows_release_nightly_pytorch.yml
@@ -19,7 +19,7 @@ on:
         description: 'python patch version'
         required: true
         type: string
-        default: "3"
+        default: "4"
 # push:
 #   branches:
 #     - master
@@ -49,7 +49,7 @@ jobs:
         echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
         curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
         ./python.exe get-pip.py
-        python -m pip wheel torch torchvision torchaudio mpmath==1.3.0 numpy==1.26.4 --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
+        python -m pip wheel torch torchvision torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
         ls ../temp_wheel_dir
         ./python.exe -s -m pip install --pre ../temp_wheel_dir/*
         sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
4 changes: 2 additions & 2 deletions .github/workflows/windows_release_package.yml
@@ -7,7 +7,7 @@ on:
         description: 'cuda version'
         required: true
         type: string
-        default: "121"
+        default: "124"
 
       python_minor:
         description: 'python minor version'
@@ -19,7 +19,7 @@ on:
         description: 'python patch version'
         required: true
         type: string
-        default: "8"
+        default: "9"
 # push:
 #   branches:
 #     - master
4 changes: 3 additions & 1 deletion README.md
@@ -12,6 +12,7 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
 ## Features
 - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
 - Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/), [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/), [SD3](https://comfyanonymous.github.io/ComfyUI_examples/sd3/) and [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
+- [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
 - Asynchronous Queue system
 - Many optimizations: Only re-executes the parts of the workflow that changes between executions.
 - Smart memory management: can automatically run models on GPUs with as low as 1GB vram.
@@ -33,6 +34,7 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
 - [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
 - [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
 - [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/)
+- [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
 - Latent previews with [TAESD](#how-to-show-high-quality-previews)
 - Starts up very fast.
 - Works fully offline: will never download anything.
@@ -77,7 +79,7 @@ Ctrl can also be replaced with Cmd instead for macOS users
 
 There is a portable standalone build for Windows that should work for running on Nvidia GPUs or for running on your CPU only on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases).
 
-### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/download/latest/ComfyUI_windows_portable_nvidia_cu121_or_cpu.7z)
+### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z)
 
 Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints
 
13 changes: 13 additions & 0 deletions comfy/cldm/cldm.py
@@ -13,6 +13,7 @@
 from ..ldm.modules.attention import SpatialTransformer
 from ..ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample
 from ..ldm.util import exists
+from .control_types import UNION_CONTROLNET_TYPES
 from collections import OrderedDict
 import comfy.ops
 from comfy.ldm.modules.attention import optimized_attention
@@ -390,6 +391,18 @@ def forward(self, x, hint, timesteps, context, y=None, **kwargs):
         if self.control_add_embedding is not None: #Union Controlnet
             control_type = kwargs.get("control_type", [])
 
+            if any([c >= self.num_control_type for c in control_type]):
+                max_type = max(control_type)
+                max_type_name = {
+                    v: k for k, v in UNION_CONTROLNET_TYPES.items()
+                }[max_type]
+                raise ValueError(
+                    f"Control type {max_type_name}({max_type}) is out of range for the number of control types" +
+                    f"({self.num_control_type}) supported.\n" +
+                    "Please consider using the ProMax ControlNet Union model.\n" +
+                    "https://huggingface.co/xinsir/controlnet-union-sdxl-1.0/tree/main"
+                )
+
             emb += self.control_add_embedding(control_type, emb.dtype, emb.device)
             if len(control_type) > 0:
                 if len(hint.shape) < 5:
10 changes: 10 additions & 0 deletions comfy/cldm/control_types.py
@@ -0,0 +1,10 @@
+UNION_CONTROLNET_TYPES = {
+    "openpose": 0,
+    "depth": 1,
+    "hed/pidi/scribble/ted": 2,
+    "canny/lineart/anime_lineart/mlsd": 3,
+    "normal": 4,
+    "segment": 5,
+    "tile": 6,
+    "repaint": 7,
+}
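
This table backs the range check added to cldm.py above: requested indices at or beyond a model's num_control_type are rejected. A minimal sketch of translating names to indices with the same check; the num_control_type default of 6 is an assumed example, and the import assumes ComfyUI is on the Python path:

    from comfy.cldm.control_types import UNION_CONTROLNET_TYPES

    def control_type_indices(names, num_control_type=6):
        # Map union control type names to indices, applying the same range
        # check that ControlNet.forward() now performs.
        indices = [UNION_CONTROLNET_TYPES[n] for n in names]
        if any(i >= num_control_type for i in indices):
            bad = max(indices)
            name = {v: k for k, v in UNION_CONTROLNET_TYPES.items()}[bad]
            raise ValueError(f"Control type {name}({bad}) is out of range for "
                             f"the {num_control_type} supported control types.")
        return indices

    print(control_type_indices(["openpose", "depth"]))  # [0, 1]
    # control_type_indices(["repaint"]) would raise for a model that only
    # supports the first 6 union control types.
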
2 changes: 1 addition & 1 deletion comfy/clip_config_bigg.json
@@ -5,7 +5,7 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "dropout": 0.0,
-  "eos_token_id": 2,
+  "eos_token_id": 49407,
   "hidden_act": "gelu",
   "hidden_size": 1280,
   "initializer_factor": 1.0,
26 changes: 14 additions & 12 deletions comfy/clip_model.py
@@ -1,5 +1,6 @@
 import torch
 from comfy.ldm.modules.attention import optimized_attention_for_device
+import comfy.ops
 
 class CLIPAttention(torch.nn.Module):
     def __init__(self, embed_dim, heads, dtype, device, operations):
@@ -71,13 +72,13 @@ def forward(self, x, mask=None, intermediate_output=None):
         return x, intermediate
 
 class CLIPEmbeddings(torch.nn.Module):
-    def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None):
+    def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None, operations=None):
         super().__init__()
-        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim, dtype=dtype, device=device)
-        self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
+        self.token_embedding = operations.Embedding(vocab_size, embed_dim, dtype=dtype, device=device)
+        self.position_embedding = operations.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
 
-    def forward(self, input_tokens):
-        return self.token_embedding(input_tokens) + self.position_embedding.weight
+    def forward(self, input_tokens, dtype=torch.float32):
+        return self.token_embedding(input_tokens, out_dtype=dtype) + comfy.ops.cast_to(self.position_embedding.weight, dtype=dtype, device=input_tokens.device)
 
 
 class CLIPTextModel_(torch.nn.Module):
@@ -87,14 +88,15 @@ def __init__(self, config_dict, dtype, device, operations):
         heads = config_dict["num_attention_heads"]
         intermediate_size = config_dict["intermediate_size"]
         intermediate_activation = config_dict["hidden_act"]
+        self.eos_token_id = config_dict["eos_token_id"]
 
         super().__init__()
-        self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device)
+        self.embeddings = CLIPEmbeddings(embed_dim, dtype=dtype, device=device, operations=operations)
         self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
         self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)
 
-    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True):
-        x = self.embeddings(input_tokens)
+    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
+        x = self.embeddings(input_tokens, dtype=dtype)
         mask = None
         if attention_mask is not None:
             mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
@@ -111,7 +113,7 @@ def forward(self, input_tokens, attention_mask=None, intermediate_output=None, f
         if i is not None and final_layer_norm_intermediate:
             i = self.final_layer_norm(i)
 
-        pooled_output = x[torch.arange(x.shape[0], device=x.device), input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1),]
+        pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
         return x, i, pooled_output
 
 class CLIPTextModel(torch.nn.Module):
@@ -153,11 +155,11 @@ def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, dty
 
         num_patches = (image_size // patch_size) ** 2
         num_positions = num_patches + 1
-        self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
+        self.position_embedding = operations.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
 
     def forward(self, pixel_values):
         embeds = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
-        return torch.cat([self.class_embedding.to(embeds.device).expand(pixel_values.shape[0], 1, -1), embeds], dim=1) + self.position_embedding.weight.to(embeds.device)
+        return torch.cat([comfy.ops.cast_to_input(self.class_embedding, embeds).expand(pixel_values.shape[0], 1, -1), embeds], dim=1) + comfy.ops.cast_to_input(self.position_embedding.weight, embeds)
 
 
 class CLIPVision(torch.nn.Module):
@@ -169,7 +171,7 @@ def __init__(self, config_dict, dtype, device, operations):
         intermediate_size = config_dict["intermediate_size"]
         intermediate_activation = config_dict["hidden_act"]
 
-        self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], dtype=torch.float32, device=device, operations=operations)
+        self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], dtype=dtype, device=device, operations=operations)
         self.pre_layrnorm = operations.LayerNorm(embed_dim)
         self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
         self.post_layernorm = operations.LayerNorm(embed_dim)
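
The pooled-output change above works together with the eos_token_id fix in clip_config_bigg.json: instead of pooling at the position of the largest token id (correct only when EOS happens to be the highest id in the vocabulary), the model now pools at the first position matching the configured EOS id. A standalone sketch with example ids and shapes:

    import torch

    eos_token_id = 49407
    # BOS, "a", "photo", EOS, then EOS-padding (example CLIP-style ids).
    tokens = torch.tensor([[49406, 320, 1125, 49407, 49407, 49407]])
    x = torch.randn(1, 6, 1280)  # hidden states: [batch, seq, hidden]

    # Old behaviour: position of the largest token id.
    old_idx = tokens.to(torch.int).argmax(dim=-1)
    # New behaviour: first position equal to the configured EOS id.
    new_idx = (torch.round(tokens.float()).to(torch.int) == eos_token_id).int().argmax(dim=-1)

    pooled = x[torch.arange(x.shape[0]), new_idx]
    print(old_idx.item(), new_idx.item(), pooled.shape)  # 3 3 torch.Size([1, 1280])

Both indices agree here because EOS is the largest id in CLIP's vocabulary. The new code instead reads eos_token_id from the model config, which is why clip_config_bigg.json's value needed correcting from 2 to 49407.
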
29 changes: 29 additions & 0 deletions comfy/latent_formats.py
@@ -139,3 +139,32 @@ def process_out(self, latent):
 
 class StableAudio1(LatentFormat):
     latent_channels = 64
+
+class Flux(SD3):
+    def __init__(self):
+        self.scale_factor = 0.3611
+        self.shift_factor = 0.1159
+        self.latent_rgb_factors =[
+            [-0.0404, 0.0159, 0.0609],
+            [ 0.0043, 0.0298, 0.0850],
+            [ 0.0328, -0.0749, -0.0503],
+            [-0.0245, 0.0085, 0.0549],
+            [ 0.0966, 0.0894, 0.0530],
+            [ 0.0035, 0.0399, 0.0123],
+            [ 0.0583, 0.1184, 0.1262],
+            [-0.0191, -0.0206, -0.0306],
+            [-0.0324, 0.0055, 0.1001],
+            [ 0.0955, 0.0659, -0.0545],
+            [-0.0504, 0.0231, -0.0013],
+            [ 0.0500, -0.0008, -0.0088],
+            [ 0.0982, 0.0941, 0.0976],
+            [-0.1233, -0.0280, -0.0897],
+            [-0.0005, -0.0530, -0.0020],
+            [-0.1273, -0.0932, -0.0680]
+        ]
+
+    def process_in(self, latent):
+        return (latent - self.shift_factor) * self.scale_factor
+
+    def process_out(self, latent):
+        return (latent / self.scale_factor) + self.shift_factor
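
process_in and process_out are exact inverses, so mapping latents into the model's working range and back round-trips up to float precision. A small self-contained check, using 16 channels to match the 16 rows of latent_rgb_factors:

    import torch

    scale_factor, shift_factor = 0.3611, 0.1159

    def process_in(latent):
        # Shift, then scale raw VAE latents into the model's working range.
        return (latent - shift_factor) * scale_factor

    def process_out(latent):
        # Exact inverse of process_in.
        return (latent / scale_factor) + shift_factor

    latent = torch.randn(1, 16, 64, 64)  # example latent batch
    assert torch.allclose(process_out(process_in(latent)), latent, atol=1e-5)
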
23 changes: 13 additions & 10 deletions comfy/ldm/audio/dit.py
@@ -9,6 +9,7 @@
 from torch import nn
 from torch.nn import functional as F
 import math
+import comfy.ops
 
 class FourierFeatures(nn.Module):
     def __init__(self, in_features, out_features, std=1., dtype=None, device=None):
@@ -18,7 +19,7 @@ def __init__(self, in_features, out_features, std=1., dtype=None, device=None):
             [out_features // 2, in_features], dtype=dtype, device=device))
 
     def forward(self, input):
-        f = 2 * math.pi * input @ self.weight.T.to(dtype=input.dtype, device=input.device)
+        f = 2 * math.pi * input @ comfy.ops.cast_to_input(self.weight.T, input)
         return torch.cat([f.cos(), f.sin()], dim=-1)
 
 # norms
@@ -38,9 +39,9 @@ def __init__(self, dim, bias=False, fix_scale=False, dtype=None, device=None):
 
     def forward(self, x):
         beta = self.beta
-        if self.beta is not None:
-            beta = beta.to(dtype=x.dtype, device=x.device)
-        return F.layer_norm(x, x.shape[-1:], weight=self.gamma.to(dtype=x.dtype, device=x.device), bias=beta)
+        if beta is not None:
+            beta = comfy.ops.cast_to_input(beta, x)
+        return F.layer_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x), bias=beta)
 
 class GLU(nn.Module):
     def __init__(
@@ -123,16 +124,18 @@ def __init__(
         scale_base = 512,
         interpolation_factor = 1.,
         base = 10000,
-        base_rescale_factor = 1.
+        base_rescale_factor = 1.,
+        dtype=None,
+        device=None,
     ):
         super().__init__()
         # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
         # has some connection to NTK literature
         # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
         base *= base_rescale_factor ** (dim / (dim - 2))
 
-        inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
-        self.register_buffer('inv_freq', inv_freq)
+        # inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer('inv_freq', torch.empty((dim // 2,), device=device, dtype=dtype))
 
         assert interpolation_factor >= 1.
         self.interpolation_factor = interpolation_factor
@@ -161,14 +164,14 @@ def forward(self, t):
 
         t = t / self.interpolation_factor
 
-        freqs = torch.einsum('i , j -> i j', t, self.inv_freq.to(dtype=dtype, device=device))
+        freqs = torch.einsum('i , j -> i j', t, comfy.ops.cast_to_input(self.inv_freq, t))
         freqs = torch.cat((freqs, freqs), dim = -1)
 
         if self.scale is None:
             return freqs, 1.
 
         power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base
-        scale = self.scale.to(dtype=dtype, device=device) ** rearrange(power, 'n -> n 1')
+        scale = comfy.ops.cast_to_input(self.scale, t) ** rearrange(power, 'n -> n 1')
         scale = torch.cat((scale, scale), dim = -1)
 
         return freqs, scale
@@ -568,7 +571,7 @@ def __init__(
         self.project_out = operations.Linear(dim, dim_out, bias=False, dtype=dtype, device=device) if dim_out is not None else nn.Identity()
 
         if rotary_pos_emb:
-            self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32))
+            self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32), device=device, dtype=dtype)
         else:
             self.rotary_pos_emb = None
 
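The inv_freq change above defers the frequency table: rather than computing it at construction time, the module registers an empty buffer of the same shape (presumably populated when model weights are loaded) and casts it to the input's dtype and device on use; the old formula survives in the comment. A standalone sketch of what that table contains, with an example dimension:

    import torch

    # The formula from the now-commented-out line, for an example dim of 64.
    dim, base = 64, 10000.0
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))  # (dim // 2,)

    t = torch.arange(8).float()                        # example positions
    freqs = torch.einsum('i , j -> i j', t, inv_freq)  # (seq, dim // 2), as in forward()
    freqs = torch.cat((freqs, freqs), dim=-1)          # (seq, dim)
    print(freqs.shape)  # torch.Size([8, 64])
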
7 changes: 4 additions & 3 deletions comfy/ldm/aura/mmdit.py
@@ -8,6 +8,7 @@
 import torch.nn.functional as F
 
 from comfy.ldm.modules.attention import optimized_attention
+import comfy.ops
 
 def modulate(x, shift, scale):
     return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
@@ -409,7 +410,7 @@ def patchify(self, x):
         pad_h = (self.patch_size - H % self.patch_size) % self.patch_size
         pad_w = (self.patch_size - W % self.patch_size) % self.patch_size
 
-        x = torch.nn.functional.pad(x, (0, pad_w, 0, pad_h), mode='reflect')
+        x = torch.nn.functional.pad(x, (0, pad_w, 0, pad_h), mode='circular')
         x = x.view(
             B,
             C,
@@ -427,7 +428,7 @@ def apply_pos_embeds(self, x, h, w):
         max_dim = max(h, w)
 
         cur_dim = self.h_max
-        pos_encoding = self.positional_encoding.reshape(1, cur_dim, cur_dim, -1).to(device=x.device, dtype=x.dtype)
+        pos_encoding = comfy.ops.cast_to_input(self.positional_encoding.reshape(1, cur_dim, cur_dim, -1), x)
 
         if max_dim > cur_dim:
             pos_encoding = F.interpolate(pos_encoding.movedim(-1, 1), (max_dim, max_dim), mode="bilinear").movedim(1, -1)
@@ -455,7 +456,7 @@ def forward(self, x, timestep, context, **kwargs):
         t = timestep
 
         c = self.cond_seq_linear(c_seq) # B, T_c, D
-        c = torch.cat([self.register_tokens.to(device=c.device, dtype=c.dtype).repeat(c.size(0), 1, 1), c], dim=1)
+        c = torch.cat([comfy.ops.cast_to_input(self.register_tokens, c).repeat(c.size(0), 1, 1), c], dim=1)
 
         global_cond = self.t_embedder(t, x.dtype) # B, D
 
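Several hunks in this commit replace explicit .to(dtype=..., device=...) calls on weights with comfy.ops.cast_to_input. Judging from the call sites, it casts a parameter or buffer to the dtype and device of a reference input tensor; a hedged sketch of an equivalent helper (the real signature in comfy.ops may differ):

    import torch

    def cast_to_input(weight, input, non_blocking=False):
        # Cast a parameter/buffer to the dtype and device of `input`,
        # mirroring how the call sites above use comfy.ops.cast_to_input.
        return weight.to(dtype=input.dtype, device=input.device, non_blocking=non_blocking)

    w = torch.randn(4, 4)                       # float32 weight
    x = torch.randn(2, 4, dtype=torch.float16)  # half-precision input
    assert cast_to_input(w, x).dtype == torch.float16
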
(Diff truncated: the remaining changed files are not shown.)