
[Bugfix] Fix guided decoding with tokenizer mode mistral #11046

Merged
6 changes: 5 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -221,8 +221,12 @@ steps:
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers
- vllm/model_executor/guided_decoding
- tests/test_logits_processor
command: pytest -v -s test_logits_processor.py
- tests/model_executor/test_guided_processors
commands:
- pytest -v -s test_logits_processor.py
- pytest -v -s model_executor/test_guided_processors.py

- label: Speculative decoding tests # 30min
source_file_dependencies:
85 changes: 83 additions & 2 deletions tests/models/decoder_only/language/test_mistral.py
@@ -3,17 +3,20 @@
Run `pytest tests/models/test_mistral.py`.
"""
import copy
import json

import jsonschema
import jsonschema.exceptions
import pytest

from vllm import SamplingParams
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( # noqa
MistralToolParser)
from vllm.sampling_params import GuidedDecodingParams, SamplingParams

from ...utils import check_logprobs_close

MODELS = [
"mistralai/Mistral-7B-Instruct-v0.1",
"mistralai/Mistral-7B-Instruct-v0.3",
]

MISTRAL_FORMAT_MODELS = [
@@ -126,6 +129,45 @@
}
]

SAMPLE_JSON_SCHEMA = {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "number"
},
"position": {
"type": "string"
}
},
"required": ["company", "position"]
}
}
},
"required": ["name", "age", "skills", "work_history"]
}


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@@ -251,3 +293,42 @@ def test_mistral_function_calling(
assert parsed_message.tool_calls[
0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa
assert parsed_message.content is None


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("guided_backend",
["outlines", "lm-format-enforcer", "xgrammar"])
def test_mistral_guided_decoding(
vllm_runner,
model: str,
guided_backend: str,
) -> None:
with vllm_runner(model, tokenizer_mode="mistral") as vllm_model:

guided_decoding = GuidedDecodingParams(json=SAMPLE_JSON_SCHEMA,
backend=guided_backend)
params = SamplingParams(max_tokens=512,
temperature=0.7,
guided_decoding=guided_decoding)

messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example JSON for an employee profile that "
f"fits this schema: {SAMPLE_JSON_SCHEMA}"
}]
outputs = vllm_model.model.chat(messages, sampling_params=params)

generated_text = outputs[0].outputs[0].text
json_response = json.loads(generated_text)
assert outputs is not None

try:
jsonschema.validate(instance=json_response,
schema=SAMPLE_JSON_SCHEMA)
except jsonschema.exceptions.ValidationError:
pytest.fail("Generated response is not valid with JSON schema")
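
For illustration only (not part of this PR's diff): a minimal hand-written instance that satisfies SAMPLE_JSON_SCHEMA (assumed in scope from the test file above), validated the same way the test validates the model output.

import json

import jsonschema

# Hypothetical example instance; the field values are illustrative only.
example = {
    "name": "Jane Doe",
    "age": 30,
    "skills": ["python", "rust", "sql"],  # at least 3 items, each <= 10 chars
    "work_history": [
        {"company": "Acme", "position": "Engineer", "duration": 2.5},
    ],
}

# Raises jsonschema.exceptions.ValidationError if the instance does not match.
jsonschema.validate(instance=example, schema=SAMPLE_JSON_SCHEMA)
print(json.dumps(example, indent=2))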
4 changes: 1 addition & 3 deletions vllm/engine/async_llm_engine.py
@@ -551,9 +551,7 @@ async def build_guided_decoding_logits_processor_async(
guided_decoding.backend = guided_decoding.backend or default_guided_backend

processor = await get_guided_decoding_logits_processor(
guided_params=guided_decoding,
tokenizer=tokenizer,
model_config=model_config)
guided_params=guided_decoding, tokenizer=tokenizer)

if processor:
if sampling_params.logits_processors is None:
4 changes: 1 addition & 3 deletions vllm/engine/llm_engine.py
@@ -2008,9 +2008,7 @@ def _build_logits_processors(
self.decoding_config.guided_decoding_backend

processor = get_local_guided_decoding_logits_processor(
guided_params=guided_decoding,
tokenizer=tokenizer,
model_config=self.model_config)
guided_params=guided_decoding, tokenizer=tokenizer)
if processor:
logits_processors.append(processor)

37 changes: 10 additions & 27 deletions vllm/model_executor/guided_decoding/__init__.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import asyncio
from typing import TYPE_CHECKING

from vllm.logger import init_logger
@@ -8,7 +9,6 @@
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer

from vllm.config import ModelConfig
from vllm.logits_process import LogitsProcessor
from vllm.sampling_params import GuidedDecodingParams

@@ -86,8 +86,8 @@ def maybe_backend_fallback(


async def get_guided_decoding_logits_processor(
guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizer,
model_config: ModelConfig) -> LogitsProcessor | None:
guided_params: GuidedDecodingParams,
tokenizer: PreTrainedTokenizer) -> LogitsProcessor | None:
guided_params = maybe_backend_fallback(guided_params)
# CFG grammar not supported by LMFE, so we use outlines instead
if guided_params.backend == 'outlines':
@@ -105,35 +105,18 @@ async def get_guided_decoding_logits_processor(
from vllm.model_executor.guided_decoding.xgrammar_decoding import ( # noqa
get_local_xgrammar_guided_decoding_logits_processor)
return get_local_xgrammar_guided_decoding_logits_processor(
guided_params, tokenizer, model_config)
guided_params, tokenizer)

raise ValueError(
f"Unknown guided decoding backend '{guided_params.backend}'. "
"Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar'")


def get_local_guided_decoding_logits_processor(
guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizer,
model_config: ModelConfig) -> LogitsProcessor | None:
guided_params = maybe_backend_fallback(guided_params)
Contributor:

Can you revert this change?

This is used for the offline use case (with LLM), whereas get_guided_decoding_logits_processor is used for the online use case.

Contributor (Author):

I reviewed my change and agree it wasn't great, given how differently get_local_outlines_guided_decoding_logits_processor and get_outlines_guided_decoding_logits_processor are implemented. Rather than reverting everything, I tried something slightly different to avoid code duplication. See if you agree; if not, I won't insist and can revert with no problem. I also updated the tests to cover both the offline and online versions, so all of these code paths, including the offline one, are exercised.

Member:

I think this is even more confusing now that there are three functions. I would prefer a revert, as it seems you have no other changes to this file. We can consider a refactor in another PR.

# CFG grammar not supported by LMFE, so we use outlines instead
if guided_params.backend == 'outlines':
# NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
from vllm.model_executor.guided_decoding.outlines_decoding import ( # noqa
get_local_outlines_guided_decoding_logits_processor)
return get_local_outlines_guided_decoding_logits_processor(
guided_params, tokenizer)
if guided_params.backend == 'lm-format-enforcer':
from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( # noqa
get_local_lm_format_enforcer_guided_decoding_logits_processor)
return get_local_lm_format_enforcer_guided_decoding_logits_processor(
guided_params, tokenizer)
if guided_params.backend == 'xgrammar':
from vllm.model_executor.guided_decoding.xgrammar_decoding import ( # noqa
get_local_xgrammar_guided_decoding_logits_processor)
return get_local_xgrammar_guided_decoding_logits_processor(
guided_params, tokenizer, model_config)
guided_params: GuidedDecodingParams,
tokenizer: PreTrainedTokenizer) -> LogitsProcessor | None:

raise ValueError(
f"Unknown guided decoding backend '{guided_params.backend}'. "
"Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar'")
loop = asyncio.get_event_loop()
f = get_guided_decoding_logits_processor(guided_params, tokenizer)
res = loop.run_until_complete(f)
return res
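
For context on the offline-vs-online discussion in the review thread above, here is a minimal usage sketch of the offline (local) path. It is not part of the diff; it assumes a local vLLM install and simply mirrors what the new test does with the LLM class.

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Offline path: LLM builds its logits processors through
# _build_logits_processors / get_local_guided_decoding_logits_processor.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral")

guided = GuidedDecodingParams(
    json={"type": "object",
          "properties": {"name": {"type": "string"}},
          "required": ["name"]},
    backend="xgrammar",
)
params = SamplingParams(max_tokens=128, temperature=0.7, guided_decoding=guided)

outputs = llm.chat(
    [{"role": "user", "content": "Give an example JSON with a name field."}],
    sampling_params=params,
)
print(outputs[0].outputs[0].text)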
70 changes: 47 additions & 23 deletions vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -16,22 +16,20 @@

from vllm.model_executor.guided_decoding.xgrammar_utils import (
convert_lark_to_gbnf, grammar_is_likely_lark)
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer

if TYPE_CHECKING:
from transformers import PreTrainedTokenizer

from vllm.config import ModelConfig
from vllm.sampling_params import GuidedDecodingParams


# TODO: passing batch size to max threads here
def get_local_xgrammar_guided_decoding_logits_processor(
guided_params: GuidedDecodingParams,
tokenizer: PreTrainedTokenizer,
model_config: ModelConfig,
max_threads: int = 8):
config = GrammarConfig.from_guided_params(guided_params=guided_params,
model_config=model_config,
tokenizer=tokenizer,
max_threads=max_threads)
return XGrammarLogitsProcessor(config)
@@ -41,7 +39,8 @@ class TokenizerData(NamedTuple):
"""Immutable container for cached tokenizer data."""
encoded_vocab: list[str]
stop_token_ids: list[int] | None
backend_str: str
backend_str: str | None
vocab_type: xgr.VocabType | None


class TokenizerDataCache:
@@ -68,18 +67,26 @@ def get_tokenizer_data(cls,
"get_vocab method.") from e

stop_token_ids = None
backend_str = xgr.VocabType.RAW
backend_str = ""
vocab_type = xgr.VocabType.RAW

if stop_token_ids is None and hasattr(
tokenizer,
"eos_token_id") and tokenizer.eos_token_id is not None:
stop_token_ids = [tokenizer.eos_token_id]

if isinstance(tokenizer, PreTrainedTokenizerFast):
backend_str = tokenizer.backend_tokenizer.to_str()
if stop_token_ids is None and hasattr(
tokenizer,
"eos_token_id") and tokenizer.eos_token_id is not None:
stop_token_ids = [tokenizer.eos_token_id]

elif isinstance(tokenizer, MistralTokenizer):
# REF: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501
vocab_type = xgr.VocabType.BYTE_FALLBACK

cls._cache[tokenizer_hash] = TokenizerData(
encoded_vocab=encoded_vocab,
stop_token_ids=stop_token_ids,
backend_str=backend_str)
backend_str=backend_str,
vocab_type=vocab_type)

return cls._cache[tokenizer_hash]

@@ -99,10 +106,18 @@ def get_compiler(cls, config: GrammarConfig) -> xgr.GrammarCompiler:

if cache_key not in cls._cache:
assert config.encoded_vocab is not None
tokenizer_info = xgr.TokenizerInfo._create_from_handle(
xgr_core.TokenizerInfo.from_huggingface(
config.encoded_vocab, config.backend_str,
config.vocab_size, config.stop_token_ids))

if config.backend_str:
tokenizer_info = xgr.TokenizerInfo._create_from_handle(
xgr_core.TokenizerInfo.from_huggingface(
config.encoded_vocab, config.backend_str,
config.vocab_size, config.stop_token_ids))
else:
tokenizer_info = xgr.TokenizerInfo(
config.encoded_vocab,
config.vocab_type,
vocab_size=config.vocab_size,
stop_token_ids=config.stop_token_ids)
cls._cache[cache_key] = xgr.GrammarCompiler(
tokenizer_info, max_threads=config.max_threads)

@@ -122,11 +137,11 @@ class GrammarConfig:
encoded_vocab: list[str] | None = None
stop_token_ids: list[int] | None = None
backend_str: str | None = None
vocab_type: xgr.VocabType = xgr.VocabType.RAW

@classmethod
def from_guided_params(cls,
guided_params: GuidedDecodingParams,
model_config: ModelConfig,
tokenizer: PreTrainedTokenizer,
max_threads: int = 8) -> GrammarConfig:

@@ -136,24 +151,27 @@ def from_guided_params(cls,
encoded_vocab = None
stop_token_ids = None
backend_str = None
vocab_type = xgr.VocabType.RAW
else:
tokenizer_data = TokenizerDataCache.get_tokenizer_data(tokenizer)
encoded_vocab = tokenizer_data.encoded_vocab
stop_token_ids = tokenizer_data.stop_token_ids
backend_str = tokenizer_data.backend_str
vocab_type = tokenizer_data.vocab_type

if guided_params.json:
if not isinstance(guided_params.json, str):
json_str = json.dumps(guided_params.json)
else:
json_str = guided_params.json
return cls(json_str=json_str,
vocab_size=model_config.hf_config.vocab_size,
vocab_size=tokenizer.vocab_size,
encoded_vocab=encoded_vocab,
stop_token_ids=stop_token_ids,
backend_str=backend_str,
tokenizer_hash=tokenizer_hash,
max_threads=max_threads)
max_threads=max_threads,
vocab_type=vocab_type)
elif guided_params.grammar:
# XGrammar only supports GBNF grammars, so we must convert Lark
if grammar_is_likely_lark(guided_params.grammar):
Expand All @@ -168,20 +186,22 @@ def from_guided_params(cls,
else:
grammar_str = guided_params.grammar
return cls(grammar_str=grammar_str,
vocab_size=model_config.hf_config.vocab_size,
vocab_size=tokenizer.vocab_size,
Member:

@aarnphm there is a reason why we needed to reference the model's vocab size and not the tokenizer's, correct?

Contributor (Author):

Cool, that was a shot in the dark; I knew someone could clarify this. I already have a conflict on this part since #11043 was recently merged, so I will revert this change. But it would be nice to document the difference between these vocab sizes. @aarnphm, do you have any reference for that?

Contributor (Author) @wallashss, Dec 10, 2024:

Thanks for checking it out, @mgoin! I really appreciate how fast you answered.

> Also the backend_str/vocab_type logic is a little confusing now.

Yeah, but the problem with this part is that it was not correct. I reported a traceback in #11045:

from_huggingface(): incompatible function arguments. The following argument types are supported: (arg0: list[str], arg1: str, arg2: Optional[int], arg3: Optional[list[int]]) -> xgrammar.xgrammar_bindings.TokenizerInfo

First, backend_str is typed as str but was assigned an enum value (VocabType). Second, from_huggingface does not accept a VocabType, which is why it crashes with the Mistral tokenizer and possibly other tokenizer types as well. For those cases we have to call the TokenizerInfo constructor and pass a VocabType (and no backend_str), which may be RAW for tiktoken-style tokenizers and BYTE_FALLBACK for Mistral. I could make this more explicit: in my first attempt I added a bool field on TokenizerData such as is_from_huggingface, but I decided to keep it more concise and infer this from the values of these fields (which must be mutually exclusive). Of course, I can add more comments to clarify and add asserts to prevent wrong behavior.

What do you think?

Contributor @aarnphm, Dec 10, 2024:

We need to reference the tokenizer vocab size because of additional padding tokens.

This is the thread: https://vllm-dev.slack.com/archives/C07QQ8DAXMK/p1732673561777159

Contributor (Author):

Thanks! I reverted the code.

encoded_vocab=encoded_vocab,
stop_token_ids=stop_token_ids,
backend_str=backend_str,
tokenizer_hash=tokenizer_hash,
max_threads=max_threads)
max_threads=max_threads,
vocab_type=vocab_type)
elif guided_params.json_object:
return cls(json_object=True,
vocab_size=model_config.hf_config.vocab_size,
vocab_size=tokenizer.vocab_size,
encoded_vocab=encoded_vocab,
stop_token_ids=stop_token_ids,
backend_str=backend_str,
tokenizer_hash=tokenizer_hash,
max_threads=max_threads)
max_threads=max_threads,
vocab_type=vocab_type)
else:
raise ValueError(
"Currently only support JSON and EBNF grammar mode for xgrammar"
@@ -257,10 +277,14 @@ def __call__(self, input_ids: list[int],
# fill_next_token_bitmask so we move it to the device of scores
device_type = scores.device.type
if device_type != "cuda":
scores = scores.to("cpu")
scores = scores.to("cpu").unsqueeze(0)

# Note: apply_token_bitmask_inplace fails on CPU if the tensors have
# different dimensions, but runs without error on GPU. Hence the
# unsqueeze above for scores, to match the token bitmask shape.
xgr.apply_token_bitmask_inplace(scores,
self.token_bitmask.to(scores.device))
if device_type != "cuda":
scores = scores.to(device_type)
scores = scores.to(device_type).squeeze()

return scores
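
To illustrate the shape note in __call__ above, here is a standalone PyTorch sketch (not xgrammar's implementation) of what applying a token bitmask amounts to: disallowed vocabulary entries are forced to -inf, and the scores row is lifted to a (1, vocab_size) layout so it matches the 2-D bitmask before masking.

import torch

vocab_size = 8
scores = torch.randn(vocab_size)             # logits for one sequence
allowed = torch.tensor([0, 1, 0, 1, 1, 0, 0, 1], dtype=torch.bool)

scores_2d = scores.unsqueeze(0)              # (1, vocab_size)
mask_2d = allowed.unsqueeze(0)               # (1, vocab_size)

# Tokens the grammar forbids get -inf so they can never be sampled.
scores_2d = scores_2d.masked_fill(~mask_2d, float("-inf"))

scores = scores_2d.squeeze(0)                # back to (vocab_size,)
print(scores)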