Commit
Merge branch 'master' into execution_model_inversion
guill committed Aug 2, 2024
2 parents c4666bf + 369f459 commit 887ceb3
Showing 66 changed files with 49,404 additions and 218 deletions.
11 changes: 9 additions & 2 deletions .ci/update_windows/update.py
@@ -62,8 +62,15 @@ def pull(repo, remote_name='origin', branch='master'):
 
 print("checking out master branch")
 branch = repo.lookup_branch('master')
-ref = repo.lookup_reference(branch.name)
-repo.checkout(ref)
+if branch is None:
+    ref = repo.lookup_reference('refs/remotes/origin/master')
+    repo.checkout(ref)
+    branch = repo.lookup_branch('master')
+    if branch is None:
+        repo.create_branch('master', repo.get(ref.target))
+else:
+    ref = repo.lookup_reference(branch.name)
+    repo.checkout(ref)
 
 print("pulling latest changes")
 pull(repo)
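
The added branch handling covers clones where no local master branch exists: the script now checks out origin's master and creates a local branch from it. A minimal standalone sketch of the same logic, assuming a pygit2 repository at an example path:

    import pygit2

    # Sketch of the fallback added above; the repository path is an example.
    repo = pygit2.Repository("ComfyUI")
    branch = repo.lookup_branch('master')
    if branch is None:
        # No local master: check out the remote-tracking ref, then create
        # a local master pointing at the same commit.
        ref = repo.lookup_reference('refs/remotes/origin/master')
        repo.checkout(ref)
        if repo.lookup_branch('master') is None:
            repo.create_branch('master', repo.get(ref.target))
    else:
        repo.checkout(repo.lookup_reference(branch.name))
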
3 changes: 2 additions & 1 deletion .github/workflows/stable-release.yml
@@ -106,4 +106,5 @@ jobs:
           file: ComfyUI_windows_portable_nvidia.7z
           tag: ${{ github.ref }}
           overwrite: true
-
+          prerelease: true
+          make_latest: false
11 changes: 8 additions & 3 deletions .github/workflows/windows_release_dependencies.yml
@@ -8,11 +8,16 @@ on:
         required: false
         type: string
         default: ""
+      extra_dependencies:
+        description: 'extra dependencies'
+        required: false
+        type: string
+        default: "\"numpy<2\""
       cu:
         description: 'cuda version'
         required: true
         type: string
-        default: "121"
+        default: "124"
 
       python_minor:
         description: 'python minor version'
@@ -24,7 +29,7 @@ on:
         description: 'python patch version'
         required: true
         type: string
-        default: "8"
+        default: "9"
 # push:
 #   branches:
 #     - master
@@ -51,7 +56,7 @@ jobs:
         ..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2
         pause" > update_comfyui_and_python_dependencies.bat
-        python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
+        python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir
         python -m pip install --no-cache-dir ./temp_wheel_dir/*
         echo installed basic
         ls -lah temp_wheel_dir
4 changes: 2 additions & 2 deletions .github/workflows/windows_release_nightly_pytorch.yml
@@ -19,7 +19,7 @@ on:
         description: 'python patch version'
         required: true
         type: string
-        default: "3"
+        default: "4"
 # push:
 #   branches:
 #     - master
@@ -49,7 +49,7 @@ jobs:
         echo 'import site' >> ./python3${{ inputs.python_minor }}._pth
         curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
         ./python.exe get-pip.py
-        python -m pip wheel torch torchvision torchaudio mpmath==1.3.0 numpy==1.26.4 --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
+        python -m pip wheel torch torchvision torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 -w ../temp_wheel_dir
         ls ../temp_wheel_dir
         ./python.exe -s -m pip install --pre ../temp_wheel_dir/*
         sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth
4 changes: 2 additions & 2 deletions .github/workflows/windows_release_package.yml
@@ -7,7 +7,7 @@ on:
         description: 'cuda version'
         required: true
         type: string
-        default: "121"
+        default: "124"
 
       python_minor:
         description: 'python minor version'
@@ -19,7 +19,7 @@ on:
         description: 'python patch version'
         required: true
         type: string
-        default: "8"
+        default: "9"
 # push:
 #   branches:
 #     - master
4 changes: 3 additions & 1 deletion README.md
@@ -12,6 +12,7 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
 ## Features
 - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
 - Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/), [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/), [SD3](https://comfyanonymous.github.io/ComfyUI_examples/sd3/) and [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
+- [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
 - Asynchronous Queue system
 - Many optimizations: Only re-executes the parts of the workflow that changes between executions.
 - Smart memory management: can automatically run models on GPUs with as low as 1GB vram.
@@ -33,6 +34,7 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
 - [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
 - [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
 - [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/)
+- [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
 - Latent previews with [TAESD](#how-to-show-high-quality-previews)
 - Starts up very fast.
 - Works fully offline: will never download anything.
@@ -77,7 +79,7 @@ Ctrl can also be replaced with Cmd instead for macOS users
 
 There is a portable standalone build for Windows that should work for running on Nvidia GPUs or for running on your CPU only on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases).
 
-### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/download/latest/ComfyUI_windows_portable_nvidia_cu121_or_cpu.7z)
+### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z)
 
 Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints
 
13 changes: 13 additions & 0 deletions comfy/cldm/cldm.py
@@ -13,6 +13,7 @@
 from ..ldm.modules.attention import SpatialTransformer
 from ..ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample
 from ..ldm.util import exists
+from .control_types import UNION_CONTROLNET_TYPES
 from collections import OrderedDict
 import comfy.ops
 from comfy.ldm.modules.attention import optimized_attention
@@ -390,6 +391,18 @@ def forward(self, x, hint, timesteps, context, y=None, **kwargs):
         if self.control_add_embedding is not None: #Union Controlnet
             control_type = kwargs.get("control_type", [])
 
+            if any([c >= self.num_control_type for c in control_type]):
+                max_type = max(control_type)
+                max_type_name = {
+                    v: k for k, v in UNION_CONTROLNET_TYPES.items()
+                }[max_type]
+                raise ValueError(
+                    f"Control type {max_type_name}({max_type}) is out of range for the number of control types" +
+                    f"({self.num_control_type}) supported.\n" +
+                    "Please consider using the ProMax ControlNet Union model.\n" +
+                    "https://huggingface.co/xinsir/controlnet-union-sdxl-1.0/tree/main"
+                )
+
             emb += self.control_add_embedding(control_type, emb.dtype, emb.device)
             if len(control_type) > 0:
                 if len(hint.shape) < 5:
10 changes: 10 additions & 0 deletions comfy/cldm/control_types.py
@@ -0,0 +1,10 @@
+UNION_CONTROLNET_TYPES = {
+    "openpose": 0,
+    "depth": 1,
+    "hed/pidi/scribble/ted": 2,
+    "canny/lineart/anime_lineart/mlsd": 3,
+    "normal": 4,
+    "segment": 5,
+    "tile": 6,
+    "repaint": 7,
+}
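
This table backs the range check added to cldm.py above: requested indices at or beyond a model's num_control_type are rejected. A minimal sketch of translating names to indices with the same check; the num_control_type default of 6 is an assumed example, and the import assumes ComfyUI is on the Python path:

    from comfy.cldm.control_types import UNION_CONTROLNET_TYPES

    def control_type_indices(names, num_control_type=6):
        # Map union control type names to indices, applying the same range
        # check that ControlNet.forward() now performs.
        indices = [UNION_CONTROLNET_TYPES[n] for n in names]
        if any(i >= num_control_type for i in indices):
            bad = max(indices)
            name = {v: k for k, v in UNION_CONTROLNET_TYPES.items()}[bad]
            raise ValueError(f"Control type {name}({bad}) is out of range for "
                             f"the {num_control_type} supported control types.")
        return indices

    print(control_type_indices(["openpose", "depth"]))  # [0, 1]
    # control_type_indices(["repaint"]) would raise for a model that only
    # supports the first 6 union control types.
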
2 changes: 1 addition & 1 deletion comfy/clip_config_bigg.json
@@ -5,7 +5,7 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "dropout": 0.0,
-  "eos_token_id": 2,
+  "eos_token_id": 49407,
   "hidden_act": "gelu",
   "hidden_size": 1280,
   "initializer_factor": 1.0,
26 changes: 14 additions & 12 deletions comfy/clip_model.py
@@ -1,5 +1,6 @@
 import torch
 from comfy.ldm.modules.attention import optimized_attention_for_device
+import comfy.ops
 
 class CLIPAttention(torch.nn.Module):
     def __init__(self, embed_dim, heads, dtype, device, operations):
@@ -71,13 +72,13 @@ def forward(self, x, mask=None, intermediate_output=None):
         return x, intermediate
 
 class CLIPEmbeddings(torch.nn.Module):
-    def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None):
+    def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None, operations=None):
         super().__init__()
-        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim, dtype=dtype, device=device)
-        self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
+        self.token_embedding = operations.Embedding(vocab_size, embed_dim, dtype=dtype, device=device)
+        self.position_embedding = operations.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
 
-    def forward(self, input_tokens):
-        return self.token_embedding(input_tokens) + self.position_embedding.weight
+    def forward(self, input_tokens, dtype=torch.float32):
+        return self.token_embedding(input_tokens, out_dtype=dtype) + comfy.ops.cast_to(self.position_embedding.weight, dtype=dtype, device=input_tokens.device)
 
 
 class CLIPTextModel_(torch.nn.Module):
@@ -87,14 +88,15 @@ def __init__(self, config_dict, dtype, device, operations):
         heads = config_dict["num_attention_heads"]
         intermediate_size = config_dict["intermediate_size"]
         intermediate_activation = config_dict["hidden_act"]
+        self.eos_token_id = config_dict["eos_token_id"]
 
         super().__init__()
-        self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device)
+        self.embeddings = CLIPEmbeddings(embed_dim, dtype=dtype, device=device, operations=operations)
         self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
         self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)
 
-    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True):
-        x = self.embeddings(input_tokens)
+    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
+        x = self.embeddings(input_tokens, dtype=dtype)
         mask = None
         if attention_mask is not None:
             mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
@@ -111,7 +113,7 @@ def forward(self, input_tokens, attention_mask=None, intermediate_output=None, f
         if i is not None and final_layer_norm_intermediate:
             i = self.final_layer_norm(i)
 
-        pooled_output = x[torch.arange(x.shape[0], device=x.device), input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1),]
+        pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
         return x, i, pooled_output
 
 class CLIPTextModel(torch.nn.Module):
@@ -153,11 +155,11 @@ def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, dty
 
         num_patches = (image_size // patch_size) ** 2
         num_positions = num_patches + 1
-        self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
+        self.position_embedding = operations.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
 
     def forward(self, pixel_values):
         embeds = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
-        return torch.cat([self.class_embedding.to(embeds.device).expand(pixel_values.shape[0], 1, -1), embeds], dim=1) + self.position_embedding.weight.to(embeds.device)
+        return torch.cat([comfy.ops.cast_to_input(self.class_embedding, embeds).expand(pixel_values.shape[0], 1, -1), embeds], dim=1) + comfy.ops.cast_to_input(self.position_embedding.weight, embeds)
 
 
 class CLIPVision(torch.nn.Module):
@@ -169,7 +171,7 @@ def __init__(self, config_dict, dtype, device, operations):
         intermediate_size = config_dict["intermediate_size"]
         intermediate_activation = config_dict["hidden_act"]
 
-        self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], dtype=torch.float32, device=device, operations=operations)
+        self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], dtype=dtype, device=device, operations=operations)
         self.pre_layrnorm = operations.LayerNorm(embed_dim)
         self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
         self.post_layernorm = operations.LayerNorm(embed_dim)
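
The pooled-output change above works together with the eos_token_id fix in clip_config_bigg.json: instead of pooling at the position of the largest token id (correct only when EOS happens to be the highest id in the vocabulary), the model now pools at the first position matching the configured EOS id. A standalone sketch with example ids and shapes:

    import torch

    eos_token_id = 49407
    # BOS, "a", "photo", EOS, then EOS-padding (example CLIP-style ids).
    tokens = torch.tensor([[49406, 320, 1125, 49407, 49407, 49407]])
    x = torch.randn(1, 6, 1280)  # hidden states: [batch, seq, hidden]

    # Old behaviour: position of the largest token id.
    old_idx = tokens.to(torch.int).argmax(dim=-1)
    # New behaviour: first position equal to the configured EOS id.
    new_idx = (torch.round(tokens.float()).to(torch.int) == eos_token_id).int().argmax(dim=-1)

    pooled = x[torch.arange(x.shape[0]), new_idx]
    print(old_idx.item(), new_idx.item(), pooled.shape)  # 3 3 torch.Size([1, 1280])

Both indices agree here because EOS is the largest id in CLIP's vocabulary. The new code instead reads eos_token_id from the model config, which is why clip_config_bigg.json's value needed correcting from 2 to 49407.
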
29 changes: 29 additions & 0 deletions comfy/latent_formats.py
@@ -139,3 +139,32 @@ def process_out(self, latent):
 
 class StableAudio1(LatentFormat):
     latent_channels = 64
+
+class Flux(SD3):
+    def __init__(self):
+        self.scale_factor = 0.3611
+        self.shift_factor = 0.1159
+        self.latent_rgb_factors =[
+            [-0.0404, 0.0159, 0.0609],
+            [ 0.0043, 0.0298, 0.0850],
+            [ 0.0328, -0.0749, -0.0503],
+            [-0.0245, 0.0085, 0.0549],
+            [ 0.0966, 0.0894, 0.0530],
+            [ 0.0035, 0.0399, 0.0123],
+            [ 0.0583, 0.1184, 0.1262],
+            [-0.0191, -0.0206, -0.0306],
+            [-0.0324, 0.0055, 0.1001],
+            [ 0.0955, 0.0659, -0.0545],
+            [-0.0504, 0.0231, -0.0013],
+            [ 0.0500, -0.0008, -0.0088],
+            [ 0.0982, 0.0941, 0.0976],
+            [-0.1233, -0.0280, -0.0897],
+            [-0.0005, -0.0530, -0.0020],
+            [-0.1273, -0.0932, -0.0680]
+        ]
+
+    def process_in(self, latent):
+        return (latent - self.shift_factor) * self.scale_factor
+
+    def process_out(self, latent):
+        return (latent / self.scale_factor) + self.shift_factor
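
process_in and process_out are exact inverses, so mapping latents into the model's working range and back round-trips up to float precision. A small self-contained check, using 16 channels to match the 16 rows of latent_rgb_factors:

    import torch

    scale_factor, shift_factor = 0.3611, 0.1159

    def process_in(latent):
        # Shift, then scale raw VAE latents into the model's working range.
        return (latent - shift_factor) * scale_factor

    def process_out(latent):
        # Exact inverse of process_in.
        return (latent / scale_factor) + shift_factor

    latent = torch.randn(1, 16, 64, 64)  # example latent batch
    assert torch.allclose(process_out(process_in(latent)), latent, atol=1e-5)
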
23 changes: 13 additions & 10 deletions comfy/ldm/audio/dit.py
@@ -9,6 +9,7 @@
 from torch import nn
 from torch.nn import functional as F
 import math
+import comfy.ops
 
 class FourierFeatures(nn.Module):
     def __init__(self, in_features, out_features, std=1., dtype=None, device=None):
@@ -18,7 +19,7 @@ def __init__(self, in_features, out_features, std=1., dtype=None, device=None):
             [out_features // 2, in_features], dtype=dtype, device=device))
 
     def forward(self, input):
-        f = 2 * math.pi * input @ self.weight.T.to(dtype=input.dtype, device=input.device)
+        f = 2 * math.pi * input @ comfy.ops.cast_to_input(self.weight.T, input)
         return torch.cat([f.cos(), f.sin()], dim=-1)
 
 # norms
@@ -38,9 +39,9 @@ def __init__(self, dim, bias=False, fix_scale=False, dtype=None, device=None):
 
     def forward(self, x):
         beta = self.beta
-        if self.beta is not None:
-            beta = beta.to(dtype=x.dtype, device=x.device)
-        return F.layer_norm(x, x.shape[-1:], weight=self.gamma.to(dtype=x.dtype, device=x.device), bias=beta)
+        if beta is not None:
+            beta = comfy.ops.cast_to_input(beta, x)
+        return F.layer_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x), bias=beta)
 
 class GLU(nn.Module):
     def __init__(
@@ -123,16 +124,18 @@ def __init__(
         scale_base = 512,
         interpolation_factor = 1.,
         base = 10000,
-        base_rescale_factor = 1.
+        base_rescale_factor = 1.,
+        dtype=None,
+        device=None,
     ):
         super().__init__()
         # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
         # has some connection to NTK literature
         # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
         base *= base_rescale_factor ** (dim / (dim - 2))
 
-        inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
-        self.register_buffer('inv_freq', inv_freq)
+        # inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer('inv_freq', torch.empty((dim // 2,), device=device, dtype=dtype))
 
         assert interpolation_factor >= 1.
         self.interpolation_factor = interpolation_factor
@@ -161,14 +164,14 @@ def forward(self, t):
 
         t = t / self.interpolation_factor
 
-        freqs = torch.einsum('i , j -> i j', t, self.inv_freq.to(dtype=dtype, device=device))
+        freqs = torch.einsum('i , j -> i j', t, comfy.ops.cast_to_input(self.inv_freq, t))
         freqs = torch.cat((freqs, freqs), dim = -1)
 
         if self.scale is None:
             return freqs, 1.
 
         power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base
-        scale = self.scale.to(dtype=dtype, device=device) ** rearrange(power, 'n -> n 1')
+        scale = comfy.ops.cast_to_input(self.scale, t) ** rearrange(power, 'n -> n 1')
         scale = torch.cat((scale, scale), dim = -1)
 
         return freqs, scale
@@ -568,7 +571,7 @@ def __init__(
         self.project_out = operations.Linear(dim, dim_out, bias=False, dtype=dtype, device=device) if dim_out is not None else nn.Identity()
 
         if rotary_pos_emb:
-            self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32))
+            self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32), device=device, dtype=dtype)
         else:
             self.rotary_pos_emb = None
 
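The inv_freq change above defers the frequency table: rather than computing it at construction time, the module registers an empty buffer of the same shape (presumably populated when model weights are loaded) and casts it to the input's dtype and device on use; the old formula survives in the comment. A standalone sketch of what that table contains, with an example dimension:

    import torch

    # The formula from the now-commented-out line, for an example dim of 64.
    dim, base = 64, 10000.0
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))  # (dim // 2,)

    t = torch.arange(8).float()                        # example positions
    freqs = torch.einsum('i , j -> i j', t, inv_freq)  # (seq, dim // 2), as in forward()
    freqs = torch.cat((freqs, freqs), dim=-1)          # (seq, dim)
    print(freqs.shape)  # torch.Size([8, 64])
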
7 changes: 4 additions & 3 deletions comfy/ldm/aura/mmdit.py
@@ -8,6 +8,7 @@
 import torch.nn.functional as F
 
 from comfy.ldm.modules.attention import optimized_attention
+import comfy.ops
 
 def modulate(x, shift, scale):
     return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
@@ -409,7 +410,7 @@ def patchify(self, x):
         pad_h = (self.patch_size - H % self.patch_size) % self.patch_size
         pad_w = (self.patch_size - W % self.patch_size) % self.patch_size
 
-        x = torch.nn.functional.pad(x, (0, pad_w, 0, pad_h), mode='reflect')
+        x = torch.nn.functional.pad(x, (0, pad_w, 0, pad_h), mode='circular')
         x = x.view(
             B,
             C,
@@ -427,7 +428,7 @@ def apply_pos_embeds(self, x, h, w):
         max_dim = max(h, w)
 
         cur_dim = self.h_max
-        pos_encoding = self.positional_encoding.reshape(1, cur_dim, cur_dim, -1).to(device=x.device, dtype=x.dtype)
+        pos_encoding = comfy.ops.cast_to_input(self.positional_encoding.reshape(1, cur_dim, cur_dim, -1), x)
 
         if max_dim > cur_dim:
             pos_encoding = F.interpolate(pos_encoding.movedim(-1, 1), (max_dim, max_dim), mode="bilinear").movedim(1, -1)
@@ -455,7 +456,7 @@ def forward(self, x, timestep, context, **kwargs):
         t = timestep
 
         c = self.cond_seq_linear(c_seq) # B, T_c, D
-        c = torch.cat([self.register_tokens.to(device=c.device, dtype=c.dtype).repeat(c.size(0), 1, 1), c], dim=1)
+        c = torch.cat([comfy.ops.cast_to_input(self.register_tokens, c).repeat(c.size(0), 1, 1), c], dim=1)
 
         global_cond = self.t_embedder(t, x.dtype) # B, D
 
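Several hunks in this commit replace explicit .to(dtype=..., device=...) calls on weights with comfy.ops.cast_to_input. Judging from the call sites, it casts a parameter or buffer to the dtype and device of a reference input tensor; a hedged sketch of an equivalent helper (the real signature in comfy.ops may differ):

    import torch

    def cast_to_input(weight, input, non_blocking=False):
        # Cast a parameter/buffer to the dtype and device of `input`,
        # mirroring how the call sites above use comfy.ops.cast_to_input.
        return weight.to(dtype=input.dtype, device=input.device, non_blocking=non_blocking)

    w = torch.randn(4, 4)                       # float32 weight
    x = torch.randn(2, 4, dtype=torch.float16)  # half-precision input
    assert cast_to_input(w, x).dtype == torch.float16
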
(Diff truncated: the remaining changed files are not shown.)