Merge branch 'master' into execution_model_inversion
guill committed Feb 22, 2024
2 parents e60dbe3 + f81dbe2 commit 5ab1565
Showing 46 changed files with 2,408 additions and 254 deletions.
README.md: 11 changes (5 additions, 6 deletions)
@@ -11,7 +11,7 @@ This ui will let you design and execute advanced stable diffusion pipelines using

## Features
- Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
-- Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/) and [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
+- Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/) and [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/)
- Asynchronous Queue system
- Many optimizations: Only re-executes the parts of the workflow that change between executions.
- Command line option: ```--lowvram``` to make it work on GPUs with less than 3GB vram (enabled automatically on GPUs with low vram)
@@ -95,24 +95,23 @@ Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints

Put your VAE in: models/vae

-Note: pytorch stable does not support python 3.12 yet. If you have python 3.12 you will have to use the nightly version of pytorch. If you run into issues you should try python 3.11 instead.

### AMD GPUs (Linux only)
AMD users can install ROCm and pytorch with pip if they aren't already installed. This is the command to install the stable version:

-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.6```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7```

-This is the command to install the nightly with ROCm 5.7 which has a python 3.12 package and might have some performance improvements:
+This is the command to install the nightly with ROCm 6.0, which might have some performance improvements:

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.7```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.0```

### NVIDIA

Nvidia users should install stable pytorch using this command:

```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121```

-This is the command to install pytorch nightly instead which has a python 3.12 package and might have performance improvements:
+This is the command to install pytorch nightly instead, which might have performance improvements:

```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121```
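
To quickly verify that the installed pytorch can see your GPU (this also works for the ROCm builds above, which report through the same `torch.cuda` API):

```python -c "import torch; print(torch.__version__, torch.cuda.is_available())"```

It should print the installed version and `True`.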

comfy/clip_model.py: 2 changes (1 addition, 1 deletion)
@@ -97,7 +97,7 @@ def forward(self, input_tokens, attention_mask=None, intermediate_output=None, f
        x = self.embeddings(input_tokens)
        mask = None
        if attention_mask is not None:
-            mask = 1.0 - attention_mask.to(x.dtype).unsqueeze(1).unsqueeze(1).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
+            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))

        causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
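
For context on the change above: replacing the two `unsqueeze` calls with a `reshape` lets the same code accept both flat `[batch, length]` padding masks and masks that already carry extra dimensions. A minimal standalone sketch of the shape behavior (toy sizes, not ComfyUI code):

```python
import torch

def build_mask(attention_mask, dtype=torch.float32):
    # Reshape to [B, 1, -1, L], then expand the query axis to L. A flat
    # [B, L] padding mask and a square [B, L, L] mask both end up as
    # [B, 1, L, L]; unsqueeze(1).unsqueeze(1) only handled the flat case.
    B, L = attention_mask.shape[0], attention_mask.shape[-1]
    mask = 1.0 - attention_mask.to(dtype).reshape((B, 1, -1, L)).expand(B, 1, L, L)
    return mask.masked_fill(mask.to(torch.bool), float("-inf"))

print(build_mask(torch.ones(2, 5)).shape)     # torch.Size([2, 1, 5, 5])
print(build_mask(torch.ones(2, 5, 5)).shape)  # torch.Size([2, 1, 5, 5])
```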
comfy/controlnet.py: 19 changes (14 additions, 5 deletions)
@@ -166,7 +166,7 @@ def get_control(self, x_noisy, t, cond, batched_number):
        if x_noisy.shape[0] != self.cond_hint.shape[0]:
            self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)

-        context = cond['c_crossattn']
+        context = cond.get('crossattn_controlnet', cond['c_crossattn'])
        y = cond.get('y', None)
        if y is not None:
            y = y.to(dtype)
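
The `.get` fallback above means a model can supply a controlnet-specific conditioning tensor under `'crossattn_controlnet'` while everything else keeps reading `'c_crossattn'`. A toy illustration of the lookup (placeholder strings instead of real tensors):

```python
cond = {"c_crossattn": "base conditioning"}
# Without the optional key, the regular cross-attention conditioning is used:
assert cond.get("crossattn_controlnet", cond["c_crossattn"]) == "base conditioning"

# A model that provides a controlnet-specific tensor overrides it:
cond["crossattn_controlnet"] = "controlnet conditioning"
assert cond.get("crossattn_controlnet", cond["c_crossattn"]) == "controlnet conditioning"
```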
@@ -318,9 +318,10 @@ def load_controlnet(ckpt_path, model=None):
        return ControlLora(controlnet_data)

    controlnet_config = None
+    supported_inference_dtypes = None

    if "controlnet_cond_embedding.conv_in.weight" in controlnet_data: #diffusers format
-        unet_dtype = comfy.model_management.unet_dtype()
-        controlnet_config = comfy.model_detection.unet_config_from_diffusers_unet(controlnet_data, unet_dtype)
+        controlnet_config = comfy.model_detection.unet_config_from_diffusers_unet(controlnet_data)
        diffusers_keys = comfy.utils.unet_to_diffusers(controlnet_config)
        diffusers_keys["controlnet_mid_block.weight"] = "middle_block_out.0.weight"
        diffusers_keys["controlnet_mid_block.bias"] = "middle_block_out.0.bias"
@@ -380,12 +381,20 @@ def load_controlnet(ckpt_path, model=None):
        return net

    if controlnet_config is None:
-        unet_dtype = comfy.model_management.unet_dtype()
-        controlnet_config = comfy.model_detection.model_config_from_unet(controlnet_data, prefix, unet_dtype, True).unet_config
+        model_config = comfy.model_detection.model_config_from_unet(controlnet_data, prefix, True)
+        supported_inference_dtypes = model_config.supported_inference_dtypes
+        controlnet_config = model_config.unet_config

    load_device = comfy.model_management.get_torch_device()
+    if supported_inference_dtypes is None:
+        unet_dtype = comfy.model_management.unet_dtype()
+    else:
+        unet_dtype = comfy.model_management.unet_dtype(supported_dtypes=supported_inference_dtypes)
+
    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
    if manual_cast_dtype is not None:
        controlnet_config["operations"] = comfy.ops.manual_cast
+    controlnet_config["dtype"] = unet_dtype
    controlnet_config.pop("out_channels")
    controlnet_config["hint_channels"] = controlnet_data["{}input_hint_block.0.weight".format(prefix)].shape[1]
    control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)
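
The reworked block lets the detected model config narrow which dtypes the controlnet may run in, then falls back to per-op weight casting when the chosen storage dtype is not computable on the load device. A condensed standalone sketch of that decision (hypothetical helper names standing in for the real checks in comfy.model_management):

```python
import torch

def choose_unet_dtype(supported_inference_dtypes, device_computes):
    # Prefer the model's first supported dtype for weight storage.
    storage_dtype = supported_inference_dtypes[0]
    # If the device can't compute in it, ops are wrapped so each layer
    # casts its weights to a computable dtype at call time (manual cast).
    manual_cast_dtype = None if device_computes(storage_dtype) else torch.float32
    return storage_dtype, manual_cast_dtype

# e.g. a model that prefers fp16 on a device that only computes in fp32:
print(choose_unet_dtype([torch.float16, torch.float32],
                        lambda dt: dt == torch.float32))
# -> (torch.float16, torch.float32): store fp16 weights, manual-cast per op
```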
comfy/gligen.py: 52 changes (27 additions, 25 deletions)
@@ -2,7 +2,8 @@
from torch import nn
from .ldm.modules.attention import CrossAttention
from inspect import isfunction

+import comfy.ops
+ops = comfy.ops.manual_cast

def exists(val):
    return val is not None
@@ -22,7 +23,7 @@ def default(val, d):
class GEGLU(nn.Module):
    def __init__(self, dim_in, dim_out):
        super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out * 2)
+        self.proj = ops.Linear(dim_in, dim_out * 2)

    def forward(self, x):
        x, gate = self.proj(x).chunk(2, dim=-1)
@@ -35,14 +36,14 @@ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
        inner_dim = int(dim * mult)
        dim_out = default(dim_out, dim)
        project_in = nn.Sequential(
-            nn.Linear(dim, inner_dim),
+            ops.Linear(dim, inner_dim),
            nn.GELU()
        ) if not glu else GEGLU(dim, inner_dim)

        self.net = nn.Sequential(
            project_in,
            nn.Dropout(dropout),
-            nn.Linear(inner_dim, dim_out)
+            ops.Linear(inner_dim, dim_out)
        )

    def forward(self, x):
@@ -57,11 +58,12 @@ def __init__(self, query_dim, context_dim, n_heads, d_head):
            query_dim=query_dim,
            context_dim=context_dim,
            heads=n_heads,
-            dim_head=d_head)
+            dim_head=d_head,
+            operations=ops)
        self.ff = FeedForward(query_dim, glu=True)

-        self.norm1 = nn.LayerNorm(query_dim)
-        self.norm2 = nn.LayerNorm(query_dim)
+        self.norm1 = ops.LayerNorm(query_dim)
+        self.norm2 = ops.LayerNorm(query_dim)

        self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
        self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
@@ -87,17 +89,18 @@ def __init__(self, query_dim, context_dim, n_heads, d_head):

        # we need a linear projection since we need cat visual feature and obj
        # feature
-        self.linear = nn.Linear(context_dim, query_dim)
+        self.linear = ops.Linear(context_dim, query_dim)

        self.attn = CrossAttention(
            query_dim=query_dim,
            context_dim=query_dim,
            heads=n_heads,
-            dim_head=d_head)
+            dim_head=d_head,
+            operations=ops)
        self.ff = FeedForward(query_dim, glu=True)

-        self.norm1 = nn.LayerNorm(query_dim)
-        self.norm2 = nn.LayerNorm(query_dim)
+        self.norm1 = ops.LayerNorm(query_dim)
+        self.norm2 = ops.LayerNorm(query_dim)

        self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
        self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
@@ -126,14 +129,14 @@ def __init__(self, query_dim, context_dim, n_heads, d_head):

        # we need a linear projection since we need cat visual feature and obj
        # feature
-        self.linear = nn.Linear(context_dim, query_dim)
+        self.linear = ops.Linear(context_dim, query_dim)

        self.attn = CrossAttention(
-            query_dim=query_dim, context_dim=query_dim, dim_head=d_head)
+            query_dim=query_dim, context_dim=query_dim, dim_head=d_head, operations=ops)
        self.ff = FeedForward(query_dim, glu=True)

-        self.norm1 = nn.LayerNorm(query_dim)
-        self.norm2 = nn.LayerNorm(query_dim)
+        self.norm1 = ops.LayerNorm(query_dim)
+        self.norm2 = ops.LayerNorm(query_dim)

        self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
        self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
@@ -201,11 +204,11 @@ def __init__(self, in_dim, out_dim, fourier_freqs=8):
        self.position_dim = fourier_freqs * 2 * 4  # 2 is sin&cos, 4 is xyxy

        self.linears = nn.Sequential(
-            nn.Linear(self.in_dim + self.position_dim, 512),
+            ops.Linear(self.in_dim + self.position_dim, 512),
            nn.SiLU(),
-            nn.Linear(512, 512),
+            ops.Linear(512, 512),
            nn.SiLU(),
-            nn.Linear(512, out_dim),
+            ops.Linear(512, out_dim),
        )

        self.null_positive_feature = torch.nn.Parameter(
@@ -215,16 +218,15 @@ def __init__(self, in_dim, out_dim, fourier_freqs=8):

    def forward(self, boxes, masks, positive_embeddings):
        B, N, _ = boxes.shape
-        dtype = self.linears[0].weight.dtype
-        masks = masks.unsqueeze(-1).to(dtype)
-        positive_embeddings = positive_embeddings.to(dtype)
+        masks = masks.unsqueeze(-1)
+        positive_embeddings = positive_embeddings

        # embedding position (it may include padding as placeholder)
-        xyxy_embedding = self.fourier_embedder(boxes.to(dtype))  # B*N*4 --> B*N*C
+        xyxy_embedding = self.fourier_embedder(boxes)  # B*N*4 --> B*N*C

        # learnable null embedding
-        positive_null = self.null_positive_feature.view(1, 1, -1)
-        xyxy_null = self.null_position_feature.view(1, 1, -1)
+        positive_null = self.null_positive_feature.to(device=boxes.device, dtype=boxes.dtype).view(1, 1, -1)
+        xyxy_null = self.null_position_feature.to(device=boxes.device, dtype=boxes.dtype).view(1, 1, -1)

        # replace padding with learnable null embedding
        positive_embeddings = positive_embeddings * \
@@ -251,7 +253,7 @@ def _set_position(self, boxes, masks, positive_embeddings):
        def func(x, extra_options):
            key = extra_options["transformer_index"]
            module = self.module_list[key]
-            return module(x, objs)
+            return module(x, objs.to(device=x.device, dtype=x.dtype))
        return func

    def set_position(self, latent_image_shape, position_params, device):
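
Throughout this file, `nn.Linear`/`nn.LayerNorm` become `ops.Linear`/`ops.LayerNorm` with `ops = comfy.ops.manual_cast`: layers whose weights are cast to the input's device and dtype at call time, which is why the explicit `.to(dtype)` calls in `forward` could be dropped. A rough standalone illustration of the idea (a simplified stand-in, not comfy's actual implementation):

```python
import torch
import torch.nn as nn

class CastLinear(nn.Linear):
    # Simplified stand-in for a manual-cast op: parameters can stay in
    # fp32 on CPU while the forward pass follows the input's dtype/device.
    def forward(self, x):
        weight = self.weight.to(device=x.device, dtype=x.dtype)
        bias = self.bias.to(device=x.device, dtype=x.dtype) if self.bias is not None else None
        return torch.nn.functional.linear(x, weight, bias)

layer = CastLinear(4, 8)  # weights stored in fp32
out = layer(torch.randn(2, 4, dtype=torch.bfloat16))
print(out.dtype)  # torch.bfloat16 -- the caller no longer casts anything
```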
comfy/latent_formats.py: 38 changes (38 additions, 0 deletions)
@@ -37,3 +37,41 @@ def __init__(self):
class SD_X4(LatentFormat):
    def __init__(self):
        self.scale_factor = 0.08333
+        self.latent_rgb_factors = [
+            [-0.2340, -0.3863, -0.3257],
+            [ 0.0994, 0.0885, -0.0908],
+            [-0.2833, -0.2349, -0.3741],
+            [ 0.2523, -0.0055, -0.1651]
+        ]
+
+class SC_Prior(LatentFormat):
+    def __init__(self):
+        self.scale_factor = 1.0
+        self.latent_rgb_factors = [
+            [-0.0326, -0.0204, -0.0127],
+            [-0.1592, -0.0427, 0.0216],
+            [ 0.0873, 0.0638, -0.0020],
+            [-0.0602, 0.0442, 0.1304],
+            [ 0.0800, -0.0313, -0.1796],
+            [-0.0810, -0.0638, -0.1581],
+            [ 0.1791, 0.1180, 0.0967],
+            [ 0.0740, 0.1416, 0.0432],
+            [-0.1745, -0.1888, -0.1373],
+            [ 0.2412, 0.1577, 0.0928],
+            [ 0.1908, 0.0998, 0.0682],
+            [ 0.0209, 0.0365, -0.0092],
+            [ 0.0448, -0.0650, -0.1728],
+            [-0.1658, -0.1045, -0.1308],
+            [ 0.0542, 0.1545, 0.1325],
+            [-0.0352, -0.1672, -0.2541]
+        ]
+
+class SC_B(LatentFormat):
+    def __init__(self):
+        self.scale_factor = 1.0
+        self.latent_rgb_factors = [
+            [ 0.1121, 0.2006, 0.1023],
+            [-0.2093, -0.0222, -0.0195],
+            [-0.3087, -0.1535, 0.0366],
+            [ 0.0290, -0.1574, -0.4078]
+        ]
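
These `latent_rgb_factors` tables are what ComfyUI's latent previewers use to approximate an RGB image from a latent without decoding through the VAE. A rough standalone sketch of that projection (assumed usage; the final scaling into [0, 1] is illustrative):

```python
import torch

def latent_to_rgb(latent, latent_rgb_factors):
    # Project a [C, H, W] latent onto RGB with a [C, 3] factor matrix.
    factors = torch.tensor(latent_rgb_factors, dtype=latent.dtype, device=latent.device)
    rgb = torch.einsum("chw,cr->rhw", latent, factors)  # [3, H, W]
    return ((rgb + 1.0) / 2.0).clamp(0.0, 1.0)          # map roughly [-1, 1] to [0, 1]

sc_b_factors = [  # the 4-channel Stable Cascade stage B table from above
    [ 0.1121,  0.2006,  0.1023],
    [-0.2093, -0.0222, -0.0195],
    [-0.3087, -0.1535,  0.0366],
    [ 0.0290, -0.1574, -0.4078],
]
print(latent_to_rgb(torch.randn(4, 32, 32), sc_b_factors).shape)  # torch.Size([3, 32, 32])
```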