[Bugfix][VLM] Fix incompatibility between #7902 and #7230 (#7948)
DarkLight1337 authored Aug 28, 2024
1 parent 98c12cf commit ef9baee
Showing 10 changed files with 120 additions and 92 deletions.
4 changes: 2 additions & 2 deletions vllm/model_executor/models/blip2.py
@@ -40,13 +40,13 @@
 class Blip2ImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: (batch_size, num_channels, height, width)"""
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""


 class Blip2ImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
-    """Shape: `(batch_size, image_feature_size, hidden_size)`
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`

     `hidden_size` must match the hidden size of language model backbone.
     """
2 changes: 1 addition & 1 deletion vllm/model_executor/models/chameleon.py
@@ -53,7 +53,7 @@
 class ChameleonImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: `(batch_size, num_channels, height, width)`"""
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""


 def get_max_chameleon_image_tokens(ctx: InputContext):
46 changes: 15 additions & 31 deletions vllm/model_executor/models/internvl.py
@@ -29,7 +29,7 @@
 from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
                    get_clip_num_patches)
 from .interfaces import SupportsMultiModal
-from .utils import (filter_weights, init_vllm_registered_model,
+from .utils import (filter_weights, flatten_bn, init_vllm_registered_model,
                     merge_multimodal_embeddings)

 IMG_START = '<img>'
@@ -42,19 +42,17 @@

 class InternVLImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
-    data: Union[torch.Tensor, List[torch.Tensor]]
+    data: torch.Tensor
     """
-    Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
-
-    Note that `num_patches` may be different for each batch, in which case
-    the data is passed as a list instead of a batched tensor.
+    Shape:
+    `(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
     """


 class InternVLImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
-    data: Union[torch.Tensor, List[torch.Tensor]]
-    """Shape: `(batch_size, image_feature_size, hidden_size)`
+    data: torch.Tensor
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`

     `hidden_size` must match the hidden size of language model backbone.
     """
@@ -357,7 +355,7 @@ def pixel_shuffle(self, x, scale_factor=0.5):
         x = x.permute(0, 2, 1, 3).contiguous()
         return x

-    def extract_feature(self, pixel_values):
+    def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor:
         vit_embeds = self.vision_model(pixel_values=pixel_values)
         vit_embeds = vit_embeds[:, 1:, :]

@@ -370,17 +368,7 @@ def extract_feature(self, pixel_values):
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds

-    def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
-        if list(data.shape[1:]) != [2]:
-            raise ValueError(
-                f"The expected image sizes shape is batch dimension plus "
-                f"{[2]}. You supplied {data.shape}.")
-
-        return data
-
-    def _validate_pixel_values(
-        self, data: Union[torch.Tensor, List[torch.Tensor]]
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:

         h = w = self.config.vision_config.image_size
         expected_dims = (3, h, w)
@@ -389,10 +377,11 @@ def _validate_shape(d: torch.Tensor):
             actual_dims = tuple(d.shape)

             if actual_dims != expected_dims:
-                expected_expr = ("num_patches", *map(str, expected_dims))
+                expected_expr = str(expected_dims)
                 raise ValueError(
-                    "The expected shape of pixel values in each batch element "
-                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
+                    "The expected shape of pixel values per image per batch "
+                    f" per patch is {expected_expr}. "
+                    f"You supplied {tuple(d.shape)}.")

         for d in data:
             _validate_shape(d)
@@ -413,12 +402,9 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of image embeddings. "
                                  f"Got type: {type(image_embeds)}")

-            # Flatten the B and N dimensions
-            image_embeds = image_embeds.flatten(0, 2)
-
             return InternVLImageEmbeddingInputs(
                 type="image_embeds",
-                data=image_embeds,
+                data=flatten_bn(image_embeds),
             )

         self.img_context_token_id = image_token_id[0]
@@ -428,12 +414,10 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")

-            # Flatten the B and N dimensions
-            pixel_values = pixel_values.flatten(0, 2)
-
             return InternVLImagePixelInputs(
                 type="pixel_values",
-                data=self._validate_pixel_values(pixel_values),
+                data=self._validate_pixel_values(
+                    flatten_bn(pixel_values, concat=True).flatten(0, 1)),
             )

         raise AssertionError("This line should be unreachable.")
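The call sites above use a flatten_bn helper imported from .utils, whose definition is not shown on this page. As a rough, hypothetical sketch inferred only from how the helper is called in these hunks (not taken from the actual vllm/model_executor/models/utils.py source), it would merge the batch and per-prompt image dimensions along these lines:

    from typing import List, Union

    import torch


    def flatten_bn(x: Union[torch.Tensor, List[torch.Tensor]],
                   *,
                   concat: bool = False
                   ) -> Union[torch.Tensor, List[torch.Tensor]]:
        """Hypothetical sketch: merge the batch (B) and per-prompt image (N) dims."""
        if isinstance(x, torch.Tensor):
            # (batch_size, num_images, ...) -> (batch_size * num_images, ...)
            return x.flatten(0, 1)

        if concat:
            # List of per-prompt (num_images_i, ...) tensors
            # -> one (sum(num_images_i), ...) tensor.
            return torch.cat(x)

        # Otherwise keep a list, with one entry per image instead of per prompt.
        return [x_n for x_b in x for x_n in x_b]

With a helper of this shape, the InternVL call flatten_bn(pixel_values, concat=True).flatten(0, 1) first collapses the batch and image dimensions and then folds the per-image patch dimension into the leading axis, which is what the new (batch_size * num_images * (1 + num_patches), num_channels, height, width) docstring describes.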
4 changes: 2 additions & 2 deletions vllm/model_executor/models/llava.py
@@ -30,13 +30,13 @@
 class LlavaImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: `(batch_size, num_channels, height, width)`"""
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""


 class LlavaImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
-    """Shape: `(batch_size, image_feature_size, hidden_size)`
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`

     `hidden_size` must match the hidden size of language model backbone.
     """
52 changes: 26 additions & 26 deletions vllm/model_executor/models/llava_next.py
@@ -29,7 +29,7 @@
 from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
                      dummy_seq_data_for_siglip, get_siglip_image_feature_size,
                      get_siglip_patch_grid_length, input_processor_for_siglip)
-from .utils import (filter_weights, init_vllm_registered_model,
+from .utils import (filter_weights, flatten_bn, init_vllm_registered_model,
                     merge_multimodal_embeddings)

 logger = init_logger(__name__)
@@ -47,15 +47,16 @@ class LlavaNextImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: Union[torch.Tensor, List[torch.Tensor]]
     """
-    Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
+    Shape:
+    `(batch_size * num_images, 1 + num_patches, num_channels, height, width)`

-    Note that `num_patches` may be different for each batch, in which case
-    the data is passed as a list instead of a batched tensor.
+    Note that `num_patches` may be different per batch and image,
+    in which case the data is passed as a list instead of a batched tensor.
     """

     image_sizes: NotRequired[torch.Tensor]
     """
-    Shape: `(batch_size, 2)`
+    Shape: `(batch_size * num_images, 2)`

     This should be in `(height, width)` format.
     """
@@ -64,7 +65,7 @@ class LlavaNextImagePixelInputs(TypedDict):
 class LlavaNextImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
-    """Shape: `(batch_size, image_feature_size, hidden_size)`
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`

     `hidden_size` must match the hidden size of language model backbone.
     """
@@ -315,10 +316,19 @@ def __init__(self,
             torch.empty(config.text_config.hidden_size))

     def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
-        if list(data.shape[1:]) != [2]:
-            raise ValueError(
-                f"The expected image sizes shape is batch dimension plus "
-                f"{[2]}. You supplied {data.shape}.")
+        expected_dims = (2, )
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = tuple(d.shape)
+
+            if actual_dims != expected_dims:
+                expected_expr = str(expected_dims)
+                raise ValueError(
+                    f"The expected shape of image sizes per image per batch "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
+
+        for d in data:
+            _validate_shape(d)

         return data

@@ -335,7 +345,7 @@ def _validate_shape(d: torch.Tensor):
             if actual_dims != expected_dims:
                 expected_expr = ("num_patches", *map(str, expected_dims))
                 raise ValueError(
-                    "The expected shape of pixel values in each batch element "
+                    "The expected shape of pixel values per image per batch "
                     f"is {expected_expr}. You supplied {tuple(d.shape)}.")

         for d in data:
@@ -357,35 +367,25 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")

-            if not isinstance(image_sizes, torch.Tensor):
+            if not isinstance(image_sizes, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of image sizes. "
                                  f"Got type: {type(image_sizes)}")

-            # Remove the N dimension until multiple images are supported.
-            if isinstance(pixel_values, torch.Tensor):
-                pixel_values = pixel_values.squeeze(1)
-            else:
-                pixel_values = [t.squeeze(0) for t in pixel_values]
-
-            image_sizes = image_sizes.squeeze(1)
-
             return LlavaNextImagePixelInputs(
                 type="pixel_values",
-                data=self._validate_pixel_values(pixel_values),
-                image_sizes=self._validate_image_sizes(image_sizes),
+                data=self._validate_pixel_values(flatten_bn(pixel_values)),
+                image_sizes=self._validate_image_sizes(
+                    flatten_bn(image_sizes, concat=True)),
             )

         if image_embeds is not None:
             if not isinstance(image_embeds, torch.Tensor):
                 raise ValueError("Incorrect type of image embeds. "
                                  f"Got type: {type(image_embeds)}")

-            # Remove the N dimension until multiple images are supported.
-            image_embeds = image_embeds.squeeze(1)
-
             return LlavaNextImageEmbeddingInputs(
                 type="image_embeds",
-                data=image_embeds,
+                data=flatten_bn(image_embeds),
             )

         raise AssertionError("This line should be unreachable.")
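To make the flattened shape convention concrete for LLaVA-NeXT, here is a small made-up usage example; the image counts, patch counts, and sizes are purely illustrative, and it assumes the hypothetical flatten_bn sketch above. Pixel values stay a list because num_patches may differ per image, while image sizes are concatenated into a single (batch_size * num_images, 2) tensor:

    import torch

    # Hypothetical inputs: 2 prompts, with 1 and 2 images respectively.
    pixel_values = [
        torch.randn(1, 3, 3, 336, 336),  # prompt 0: 1 image, 1 + 2 patches
        torch.randn(2, 5, 3, 336, 336),  # prompt 1: 2 images, 1 + 4 patches each
    ]
    image_sizes = [
        torch.tensor([[672, 336]]),               # prompt 0
        torch.tensor([[672, 672], [336, 1008]]),  # prompt 1
    ]

    flat_pixels = flatten_bn(pixel_values)
    # -> list of 3 tensors, one per image, each (1 + num_patches, 3, 336, 336)

    flat_sizes = flatten_bn(image_sizes, concat=True)
    # -> tensor of shape (3, 2), i.e. (batch_size * num_images, 2)

Each row of flat_sizes then has shape (2, ), which is exactly what the new per-element _validate_image_sizes check above expects.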
4 changes: 2 additions & 2 deletions vllm/model_executor/models/paligemma.py
@@ -34,13 +34,13 @@
 class PaliGemmaImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: (batch_size, num_channels, height, width)"""
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""


 class PaliGemmaImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
-    """Shape: `(batch_size, image_feature_size, hidden_size)`
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`

     `hidden_size` must match the hidden size of language model backbone.
     """
50 changes: 27 additions & 23 deletions vllm/model_executor/models/phi3v.py
@@ -44,7 +44,7 @@

 from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
 from .interfaces import SupportsMultiModal
-from .utils import merge_multimodal_embeddings
+from .utils import flatten_bn, merge_multimodal_embeddings

 logger = init_logger(__name__)

@@ -75,15 +75,16 @@ class Phi3VImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: Union[torch.Tensor, List[torch.Tensor]]
     """
-    Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
+    Shape:
+    `(batch_size * num_images, 1 + num_patches, num_channels, height, width)`

-    Note that `num_patches` may be different for each batch, in which case
-    the data is passed as a list instead of a batched tensor.
+    Note that `num_patches` may be different per batch and image,
+    in which case the data is passed as a list instead of a batched tensor.
     """

     image_sizes: torch.Tensor
     """
-    Shape: `(batch_size, 2)`
+    Shape: `(batch_size * num_images, 2)`

     This should be in `(height, width)` format.
     """
@@ -92,7 +93,7 @@ class Phi3VImagePixelInputs(TypedDict):
 class Phi3VImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: Union[torch.Tensor, List[torch.Tensor]]
-    """Shape: `(batch_size, image_feature_size, hidden_size)`
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`

     `hidden_size` must match the hidden size of language model backbone.
     """
@@ -511,10 +512,19 @@ def __init__(self,
         self.sampler = Sampler()

     def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
-        if list(data.shape[1:]) != [2]:
-            raise ValueError(
-                f"The expected shape of image sizes is batch dimension plus "
-                f"{[2]}. You supplied {tuple(data.shape)}.")
+        expected_dims = (2, )
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = tuple(d.shape)
+
+            if actual_dims != expected_dims:
+                expected_expr = str(expected_dims)
+                raise ValueError(
+                    f"The expected shape of image sizes per image per batch "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
+
+        for d in data:
+            _validate_shape(d)

         return data

@@ -531,7 +541,7 @@ def _validate_shape(d: torch.Tensor):
             if actual_dims != expected_dims:
                 expected_expr = ("num_patches", *map(str, expected_dims))
                 raise ValueError(
-                    "The expected shape of pixel values in each batch element "
+                    "The expected shape of pixel values per image per batch "
                     f"is {expected_expr}. You supplied {tuple(d.shape)}.")

         for d in data:
@@ -556,30 +566,24 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")

-            if not isinstance(image_sizes, torch.Tensor):
+            if not isinstance(image_sizes, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of image sizes. "
                                  f"Got type: {type(image_sizes)}")

-            # Merge the B and N dimensions.
-            if isinstance(pixel_values, torch.Tensor):
-                pixel_values = pixel_values.flatten(0, 1)
-            else:
-                pixel_values = torch.cat(pixel_values)
-
-            image_sizes = image_sizes.flatten(0, 1)
-
             return Phi3VImagePixelInputs(
                 type="pixel_values",
-                data=self._validate_pixel_values(pixel_values),
-                image_sizes=self._validate_image_sizes(image_sizes))
+                data=self._validate_pixel_values(flatten_bn(pixel_values)),
+                image_sizes=self._validate_image_sizes(
+                    flatten_bn(image_sizes, concat=True)))

         if image_embeds is not None:
             if not isinstance(image_embeds, torch.Tensor):
                 raise ValueError("Incorrect type of image embeddings. "
                                  f"Got type: {type(image_embeds)}")

             return Phi3VImageEmbeddingInputs(
                 type="image_embeds",
-                data=image_embeds,
+                data=flatten_bn(image_embeds),
             )

         raise AssertionError("This line should be unreachable.")
2 changes: 1 addition & 1 deletion vllm/model_executor/models/ultravox.py
@@ -49,7 +49,7 @@
 class UltravoxAudioFeatureInputs(TypedDict):
     type: Literal["audio_features"]
     data: Union[torch.Tensor, List[torch.Tensor]]
-    """Shape: `(batch_size, 80, M)"""
+    """Shape: `(batch_size * num_audios, 80, M)"""


 class UltravoxAudioEmbeddingInputs(TypedDict):
(The remaining 2 changed files are not shown.)
