diff --git a/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py b/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py
index 53d57188743de..19f62e789e2de 100644
--- a/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py
+++ b/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py
@@ -19,10 +19,21 @@
 from transformers import CLIPPreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.models.clip.configuration_clip import CLIPTextConfig
-from transformers.models.clip.modeling_clip import (
-    CLIPEncoder,
-    _expand_mask,
-)
+from transformers.models.clip.modeling_clip import CLIPEncoder
+
+
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
 
 # This is a modified version of the CLIPTextModel from transformers.models.clip.modeling_clip
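
Note (not part of the patch): `_expand_mask` was a private helper that recent `transformers` releases removed from `transformers.models.clip.modeling_clip`, which is why the patch inlines a copy. A minimal sketch of what the vendored helper computes, assuming the `_expand_mask` definition added above is in scope (the toy mask values are illustrative only):

    import torch

    # [bsz=1, src_len=3]; 1 = real token, 0 = padding
    mask = torch.tensor([[1, 1, 0]])
    out = _expand_mask(mask, torch.float32)

    # out has shape [1, 1, 3, 3]: columns for kept tokens are 0.0, while the
    # padded column holds torch.finfo(torch.float32).min, so those positions
    # contribute ~0 attention weight after softmax.
    print(out.shape)
    print(out[0, 0])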