[Bugfix] Fix guided decoding with tokenizer mode mistral #11046
@@ -16,22 +16,20 @@
from vllm.model_executor.guided_decoding.xgrammar_utils import (
    convert_lark_to_gbnf, grammar_is_likely_lark)
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer

if TYPE_CHECKING:
    from transformers import PreTrainedTokenizer

    from vllm.config import ModelConfig
    from vllm.sampling_params import GuidedDecodingParams


# TODO: passing batch size to max threads here
def get_local_xgrammar_guided_decoding_logits_processor(
        guided_params: GuidedDecodingParams,
        tokenizer: PreTrainedTokenizer,
        model_config: ModelConfig,
        max_threads: int = 8):
    config = GrammarConfig.from_guided_params(guided_params=guided_params,
                                              model_config=model_config,
                                              tokenizer=tokenizer,
                                              max_threads=max_threads)
    return XGrammarLogitsProcessor(config)
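For orientation, the object returned here is used as a vLLM logits processor: a callable over the generated token ids and the raw next-token scores that returns adjusted scores. A toy, framework-free sketch of that contract (not the xgrammar implementation; masking token id 0 is purely illustrative):

```python
import torch

def toy_logits_processor(input_ids: list[int],
                         scores: torch.Tensor) -> torch.Tensor:
    """Toy stand-in for a guided-decoding processor: forbid token id 0."""
    scores = scores.clone()
    scores[0] = float("-inf")  # a grammar-driven mask would rule out many ids
    return scores

scores = torch.randn(32_000)                   # one row of logits over the vocab
out = toy_logits_processor([1, 5, 42], scores)
assert out[0] == float("-inf")                 # token 0 can no longer be sampled
```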
@@ -41,7 +39,8 @@ class TokenizerData(NamedTuple):
     """Immutable container for cached tokenizer data."""
     encoded_vocab: list[str]
     stop_token_ids: list[int] | None
-    backend_str: str
+    backend_str: str | None
+    vocab_type: xgr.VocabType | None


 class TokenizerDataCache:
@@ -68,18 +67,26 @@ def get_tokenizer_data(cls,
                     "get_vocab method.") from e

             stop_token_ids = None
-            backend_str = xgr.VocabType.RAW
+            backend_str = ""
+            vocab_type = xgr.VocabType.RAW

+            if stop_token_ids is None and hasattr(
+                    tokenizer,
+                    "eos_token_id") and tokenizer.eos_token_id is not None:
+                stop_token_ids = [tokenizer.eos_token_id]
+
             if isinstance(tokenizer, PreTrainedTokenizerFast):
                 backend_str = tokenizer.backend_tokenizer.to_str()
-                if stop_token_ids is None and hasattr(
-                        tokenizer,
-                        "eos_token_id") and tokenizer.eos_token_id is not None:
-                    stop_token_ids = [tokenizer.eos_token_id]

+            elif isinstance(tokenizer, MistralTokenizer):
+                # REF: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43  # noqa: E501
+                vocab_type = xgr.VocabType.BYTE_FALLBACK
+
             cls._cache[tokenizer_hash] = TokenizerData(
                 encoded_vocab=encoded_vocab,
                 stop_token_ids=stop_token_ids,
-                backend_str=backend_str)
+                backend_str=backend_str,
+                vocab_type=vocab_type)

         return cls._cache[tokenizer_hash]
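To see which branch of the hunk above a given tokenizer would take, a small probe helps. This is a sketch assuming `transformers` and `xgrammar` are installed; "gpt2" is just an arbitrary example checkpoint, and the `else` branch loosely mirrors the MistralTokenizer case from the diff:

```python
import xgrammar as xgr
from transformers import AutoTokenizer, PreTrainedTokenizerFast

tok = AutoTokenizer.from_pretrained("gpt2")  # any HF checkpoint works here

backend_str = ""
vocab_type = xgr.VocabType.RAW
if isinstance(tok, PreTrainedTokenizerFast):
    # Fast tokenizers expose a serialized backend string xgrammar can ingest.
    backend_str = tok.backend_tokenizer.to_str()
else:
    # Non-fast tokenizers (e.g. vLLM's MistralTokenizer in the diff) get a
    # vocab type instead; BYTE_FALLBACK tells xgrammar how to read the vocab.
    vocab_type = xgr.VocabType.BYTE_FALLBACK

print(len(backend_str), vocab_type)
```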
@@ -99,10 +106,18 @@ def get_compiler(cls, config: GrammarConfig) -> xgr.GrammarCompiler:

         if cache_key not in cls._cache:
             assert config.encoded_vocab is not None
-            tokenizer_info = xgr.TokenizerInfo._create_from_handle(
-                xgr_core.TokenizerInfo.from_huggingface(
-                    config.encoded_vocab, config.backend_str,
-                    config.vocab_size, config.stop_token_ids))
+            if config.backend_str:
+                tokenizer_info = xgr.TokenizerInfo._create_from_handle(
+                    xgr_core.TokenizerInfo.from_huggingface(
+                        config.encoded_vocab, config.backend_str,
+                        config.vocab_size, config.stop_token_ids))
+            else:
+                tokenizer_info = xgr.TokenizerInfo(
+                    config.encoded_vocab,
+                    config.vocab_type,
+                    vocab_size=config.vocab_size,
+                    stop_token_ids=config.stop_token_ids)
             cls._cache[cache_key] = xgr.GrammarCompiler(
                 tokenizer_info, max_threads=config.max_threads)
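Downstream, the tokenizer info feeds an `xgr.GrammarCompiler`, which is what actually turns a JSON schema or GBNF grammar into a token-level matcher. A rough sketch of that flow using xgrammar's public helper; the helper names and signatures here come from my reading of xgrammar's API rather than from this diff, so treat them as assumptions:

```python
import xgrammar as xgr
from transformers import AutoTokenizer

# Fast-tokenizer path: the public helper builds TokenizerInfo straight from
# the tokenizer object (the diff instead passes encoded_vocab/backend_str or
# encoded_vocab/vocab_type explicitly, the lower-level equivalent).
hf_tok = AutoTokenizer.from_pretrained("gpt2")
tok_info = xgr.TokenizerInfo.from_huggingface(hf_tok)

compiler = xgr.GrammarCompiler(tok_info, max_threads=8)
compiled = compiler.compile_builtin_json_grammar()  # "any valid JSON" grammar
print(type(compiled).__name__)
```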
@@ -122,11 +137,11 @@ class GrammarConfig:
     encoded_vocab: list[str] | None = None
     stop_token_ids: list[int] | None = None
     backend_str: str | None = None
+    vocab_type: xgr.VocabType = xgr.VocabType.RAW

     @classmethod
     def from_guided_params(cls,
                            guided_params: GuidedDecodingParams,
                            model_config: ModelConfig,
                            tokenizer: PreTrainedTokenizer,
                            max_threads: int = 8) -> GrammarConfig:
@@ -136,24 +151,27 @@ def from_guided_params(cls,
             encoded_vocab = None
             stop_token_ids = None
             backend_str = None
+            vocab_type = xgr.VocabType.RAW
         else:
             tokenizer_data = TokenizerDataCache.get_tokenizer_data(tokenizer)
             encoded_vocab = tokenizer_data.encoded_vocab
             stop_token_ids = tokenizer_data.stop_token_ids
             backend_str = tokenizer_data.backend_str
+            vocab_type = tokenizer_data.vocab_type

         if guided_params.json:
             if not isinstance(guided_params.json, str):
                 json_str = json.dumps(guided_params.json)
             else:
                 json_str = guided_params.json
             return cls(json_str=json_str,
-                       vocab_size=model_config.hf_config.vocab_size,
+                       vocab_size=tokenizer.vocab_size,
                        encoded_vocab=encoded_vocab,
                        stop_token_ids=stop_token_ids,
                        backend_str=backend_str,
                        tokenizer_hash=tokenizer_hash,
-                       max_threads=max_threads)
+                       max_threads=max_threads,
+                       vocab_type=vocab_type)
         elif guided_params.grammar:
             # XGrammar only supports GBNF grammars, so we must convert Lark
             if grammar_is_likely_lark(guided_params.grammar):
@@ -168,20 +186,22 @@ def from_guided_params(cls,
             else:
                 grammar_str = guided_params.grammar
             return cls(grammar_str=grammar_str,
-                       vocab_size=model_config.hf_config.vocab_size,
+                       vocab_size=tokenizer.vocab_size,
Review thread on this line:

"@aarnphm there is a reason why we needed to reference the model's vocab size and not the tokenizer's, correct?"

"Thanks for checking it out @mgoin! I really appreciate how fast you answered this. Yeah, but the problem with this part is that it was not correct; I reported a traceback in #11045. What do you think?"

"We need to reference tokenizer vocab size because of additional padding tokens. This is the thread: https://vllm-dev.slack.com/archives/C07QQ8DAXMK/p1732673561777159"

"Thanks! I reverted the code."
                        encoded_vocab=encoded_vocab,
                        stop_token_ids=stop_token_ids,
                        backend_str=backend_str,
                        tokenizer_hash=tokenizer_hash,
-                       max_threads=max_threads)
+                       max_threads=max_threads,
+                       vocab_type=vocab_type)
         elif guided_params.json_object:
             return cls(json_object=True,
-                       vocab_size=model_config.hf_config.vocab_size,
+                       vocab_size=tokenizer.vocab_size,
                        encoded_vocab=encoded_vocab,
                        stop_token_ids=stop_token_ids,
                        backend_str=backend_str,
                        tokenizer_hash=tokenizer_hash,
-                       max_threads=max_threads)
+                       max_threads=max_threads,
+                       vocab_type=vocab_type)
         else:
             raise ValueError(
                 "Currently only support JSON and EBNF grammar mode for xgrammar"
@@ -257,10 +277,14 @@ def __call__(self, input_ids: list[int],
         # fill_next_token_bitmask so we move it to the device of scores
         device_type = scores.device.type
         if device_type != "cuda":
-            scores = scores.to("cpu")
+            scores = scores.to("cpu").unsqueeze(0)

+        # Note: In this method, if the tensors have different dimensions
+        # on CPU device fails, but on GPU it runs without error. Hence the
+        # unsqueeze above for scores, to match the token bitmask shape
         xgr.apply_token_bitmask_inplace(scores,
                                         self.token_bitmask.to(scores.device))
         if device_type != "cuda":
-            scores = scores.to(device_type)
+            scores = scores.to(device_type).squeeze()

         return scores
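The note added in this hunk is about shapes: the token bitmask carries a batch dimension, so on CPU the 1-D scores row is lifted to (1, vocab) before masking and squeezed back afterwards. A torch-only sketch of that shape handling, using a plain boolean mask as a stand-in for xgrammar's packed bitmask and for `apply_token_bitmask_inplace`:

```python
import torch

vocab_size = 1_000
scores = torch.randn(vocab_size)                        # 1-D logits row on CPU
allowed = torch.zeros(1, vocab_size, dtype=torch.bool)  # (batch=1, vocab) mask
allowed[0, :10] = True                                  # pretend only ids 0-9 are legal

scores = scores.unsqueeze(0)                  # (vocab,) -> (1, vocab), as in the diff
scores = scores.masked_fill(~allowed, float("-inf"))    # stand-in for the in-place
                                                        # xgrammar masking call
scores = scores.squeeze()                     # back to (vocab,) for the caller
print(scores.shape)                           # torch.Size([1000])
```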
A separate review thread on the change to the guided decoding entry points:

"Can you revert this change? This is being used for the offline use case, with LLM, whereas get_guided_decoding_logits_processor is being used for the online use case."

"I reviewed what I did and agree it was not great, given how differently get_local_outlines_guided_decoding_logits_processor and get_outlines_guided_decoding_logits_processor are implemented. But I tried something a little different instead of reverting everything, just to avoid code duplication. See if you agree; if not, I won't insist and can revert with no problem. I also updated the tests to check both the offline and online versions, so that all of these code paths are exercised, including the offline path."

"I think this is even more confusing now that there are three functions. I would prefer a revert, as it seems you have no other changes to this file? We can consider a refactor in another PR."
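For readers following the offline/online discussion above: the idea the author was reaching for is a single shared core wrapped by a synchronous (offline, `LLM`) entry point and an asynchronous (online server) one. A generic sketch of that pattern; the function names below are illustrative only and are not vLLM's actual entry points:

```python
import asyncio

def _build_processor(guided_params: dict, tokenizer: object):
    """Hypothetical shared core that would pick and build the real processor."""
    # A real implementation would dispatch on guided_params and return e.g.
    # an xgrammar- or outlines-backed logits processor; a placeholder closure
    # is enough to show the structure.
    return lambda input_ids, scores: scores

def get_local_processor(guided_params: dict, tokenizer: object):
    # Offline path: plain synchronous call.
    return _build_processor(guided_params, tokenizer)

async def get_processor(guided_params: dict, tokenizer: object):
    # Online path: same core, kept off the event loop.
    return await asyncio.to_thread(_build_processor, guided_params, tokenizer)

sync_proc = get_local_processor({"json": "{}"}, tokenizer=None)
async_proc = asyncio.run(get_processor({"json": "{}"}, tokenizer=None))
assert callable(sync_proc) and callable(async_proc)
```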