diff --git a/tests/conftest.py b/tests/conftest.py index 67885b93285c5..79846bb024dba 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,14 +11,15 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer, BatchEncoding) -from vllm import LLM, SamplingParams from vllm.config import TokenizerPoolConfig, VisionLanguageConfig from vllm.distributed import (destroy_distributed_environment, destroy_model_parallel) +from vllm.entrypoints.llm import LLM from vllm.inputs import TextPrompt from vllm.logger import init_logger from vllm.multimodal import MultiModalData from vllm.multimodal.image import ImageFeatureData, ImagePixelData +from vllm.sampling_params import SamplingParams from vllm.sequence import SampleLogprobs from vllm.utils import cuda_device_count_stateless, is_cpu diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py new file mode 100644 index 0000000000000..948778b2c22ed --- /dev/null +++ b/tests/entrypoints/conftest.py @@ -0,0 +1,72 @@ +import pytest + + +@pytest.fixture +def sample_regex(): + return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") + + +@pytest.fixture +def sample_json_schema(): + return { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "skills": { + "type": "array", + "items": { + "type": "string", + "maxLength": 10 + }, + "minItems": 3 + }, + "work_history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": { + "type": "string" + }, + "duration": { + "type": "number" + }, + "position": { + "type": "string" + } + }, + "required": ["company", "position"] + } + } + }, + "required": ["name", "age", "skills", "work_history"] + } + + +@pytest.fixture +def sample_guided_choice(): + return [ + "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", + "Ruby", "Swift", "Kotlin" + ] + + +@pytest.fixture +def sample_sql_statements(): + return (""" +start: select_statement + +select_statement: "SELECT" column "from" table "where" condition + +column: "col_1" | "col_2" +table: "table_1" | "table_2" +condition: column "=" number + +number: "1" | "2" +""") diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index fb32a9d155bc0..a8792d7ae9737 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -4,67 +4,22 @@ import torch from transformers import AutoTokenizer -from vllm.entrypoints.openai.protocol import CompletionRequest from vllm.model_executor.guided_decoding import ( - get_guided_decoding_logits_processor) + GuidedDecodingFields, get_guided_decoding_logits_processor) from vllm.model_executor.guided_decoding.outlines_logits_processors import ( JSONLogitsProcessor, RegexLogitsProcessor) -TEST_SCHEMA = { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "age": { - "type": "integer" - }, - "skills": { - "type": "array", - "items": { - "type": "string", - "maxLength": 10 - }, - "minItems": 3 - }, - "work history": { - "type": "array", - "items": { - "type": "object", - "properties": { - "company": { - "type": "string" - }, - "duration": { - "type": "string" - }, - "position": { - "type": "string" - } - }, - "required": ["company", "position"] - } - } - }, - "required": ["name", "age", "skills", "work history"] -} -TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") - -pytestmark = 
pytest.mark.openai - - -def test_guided_logits_processors(): +def test_guided_logits_processors(sample_regex, sample_json_schema): """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor.""" tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') - regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer) - json_LP = JSONLogitsProcessor(TEST_SCHEMA, + regex_LP = RegexLogitsProcessor(sample_regex, tokenizer) + json_LP = JSONLogitsProcessor(sample_json_schema, tokenizer, whitespace_pattern=None) token_ids = tokenizer.encode( - f"Give an example IPv4 address with this regex: {TEST_REGEX}") + f"Give an example IPv4 address with this regex: {sample_regex}") tensor = torch.rand(32000) original_tensor = torch.clone(tensor) regex_LP(token_ids, tensor) @@ -72,7 +27,8 @@ def test_guided_logits_processors(): assert not torch.allclose(tensor, original_tensor) token_ids = tokenizer.encode( - f"Give an employee profile that fits this schema: {TEST_SCHEMA}") + f"Give an employee profile that fits this schema: {sample_json_schema}" + ) tensor = torch.rand(32000) original_tensor = torch.clone(tensor) json_LP(token_ids, tensor) @@ -82,15 +38,15 @@ def test_guided_logits_processors(): @pytest.mark.asyncio @pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"]) -async def test_guided_logits_processor_black_box(backend: str): +async def test_guided_logits_processor_black_box(sample_regex, + sample_json_schema, + backend: str): tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') token_ids = tokenizer.encode( - f"Give an example IPv4 address with this regex: {TEST_REGEX}") - regex_request = CompletionRequest(model='test', - prompt=token_ids, - guided_regex=TEST_REGEX) - regex_lp = await get_guided_decoding_logits_processor( - backend, regex_request, tokenizer) + f"Give an example IPv4 address with this regex: {sample_regex}") + regex_lp = get_guided_decoding_logits_processor( + GuidedDecodingFields(guided_regex=sample_regex, + guided_decoding_backend=backend), tokenizer) assert regex_lp is not None tensor = torch.rand(32000) original_tensor = torch.clone(tensor) @@ -99,12 +55,11 @@ async def test_guided_logits_processor_black_box(backend: str): assert not torch.allclose(tensor, original_tensor) token_ids = tokenizer.encode( - f"Give an employee profile that fits this schema: {TEST_SCHEMA}") - json_request = CompletionRequest(model='test', - prompt=token_ids, - guided_json=TEST_SCHEMA) - json_lp = await get_guided_decoding_logits_processor( - backend, json_request, tokenizer) + f"Give an employee profile that fits this schema: {sample_json_schema}" + ) + json_lp = get_guided_decoding_logits_processor( + GuidedDecodingFields(guided_json=sample_json_schema, + guided_decoding_backend=backend), tokenizer) assert json_lp is not None tensor = torch.rand(32000) original_tensor = torch.clone(tensor) diff --git a/tests/entrypoints/test_llm_generate.py b/tests/entrypoints/test_llm_generate.py index a00fff91a310e..88477a7cc9832 100644 --- a/tests/entrypoints/test_llm_generate.py +++ b/tests/entrypoints/test_llm_generate.py @@ -1,13 +1,18 @@ +import json +import re import weakref from typing import List +import jsonschema import pytest -from vllm import LLM, RequestOutput, SamplingParams +from vllm.entrypoints.llm import LLM +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams from ..conftest import cleanup -MODEL_NAME = "facebook/opt-125m" +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" PROMPTS = [ "Hello, my name is", @@ -15,7 
+20,6 @@ "The capital of France is", "The future of AI is", ] - TOKEN_IDS = [ [0], [0, 1], @@ -30,11 +34,7 @@ def llm(): # pytest caches the fixture so we use weakref.proxy to # enable garbage collection - llm = LLM(model=MODEL_NAME, - max_num_batched_tokens=4096, - tensor_parallel_size=1, - gpu_memory_utilization=0.10, - enforce_eager=True) + llm = LLM(model=MODEL_NAME, max_model_len=1024) with llm.deprecate_legacy_api(): yield weakref.proxy(llm) @@ -119,6 +119,13 @@ def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): @pytest.mark.skip_global_cleanup def test_multiple_sampling_params(llm: LLM): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = [ SamplingParams(temperature=0.01, top_p=0.95), SamplingParams(temperature=0.3, top_p=0.95), @@ -140,5 +147,119 @@ def test_multiple_sampling_params(llm: LLM): assert len(PROMPTS) == len(outputs) # sampling_params is None, default params should be applied - outputs = llm.generate(PROMPTS, sampling_params=None) - assert len(PROMPTS) == len(outputs) + outputs = llm.generate(prompts, sampling_params=None) + assert len(prompts) == len(outputs) + + +@pytest.mark.skip_global_cleanup +def test_guided_regex(sample_regex, llm): + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + ) + outputs = llm.generate(prompts=[ + f"Give an example IPv4 address with this regex: {sample_regex}" + ] * 2, + sampling_params=sampling_params, + use_tqdm=True, + guided_options=dict(guided_regex=sample_regex)) + + assert outputs is not None + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + generated_text = output.outputs[0].text + print(generated_text) + assert generated_text is not None + assert re.fullmatch(sample_regex, generated_text) is not None + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +@pytest.mark.skip_global_cleanup +def test_guided_json_completion(sample_json_schema, llm): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + ) + outputs = llm.generate(prompts=[ + f"Give an example JSON for an employee profile " + f"that fits this schema: {sample_json_schema}" + ] * 2, + sampling_params=sampling_params, + use_tqdm=True, + guided_options=dict(guided_json=sample_json_schema)) + + assert outputs is not None + + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + + generated_text = output.outputs[0].text + assert generated_text is not None + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + output_json = json.loads(generated_text) + jsonschema.validate(instance=output_json, schema=sample_json_schema) + + +@pytest.mark.skip_global_cleanup +def test_guided_choice_completion(sample_guided_choice, llm): + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + ) + outputs = llm.generate( + prompts="The best language for type-safe systems programming is ", + sampling_params=sampling_params, + use_tqdm=True, + guided_options=dict(guided_choice=sample_guided_choice)) + + assert outputs is not None + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + generated_text = output.outputs[0].text + print(generated_text) + assert generated_text is not None + assert generated_text in sample_guided_choice + print(f"Prompt: {prompt!r}, Generated text: 
{generated_text!r}") + + +@pytest.mark.skip_global_cleanup +def test_guided_grammar(sample_sql_statements, llm): + + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + ) + outputs = llm.generate( + prompts=("Generate a sql state that select col_1 from " + "table_1 where it is equals to 1"), + sampling_params=sampling_params, + use_tqdm=True, + guided_options=dict(guided_grammar=sample_sql_statements)) + + assert outputs is not None + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + + generated_text = output.outputs[0].text + assert generated_text is not None + + # use Lark to parse the output, and make sure it's a valid parse tree + from lark import Lark + parser = Lark(sample_sql_statements) + parser.parse(generated_text) + + # remove spaces for comparison b/c we removed them in the grammar + ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace( + " ", "") + + assert generated_text.strip() == ground_truth + + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index c22a675ff1230..698fc09accdc7 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -24,53 +24,6 @@ # generation quality here LORA_NAME = "typeof/zephyr-7b-beta-lora" -TEST_SCHEMA = { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "age": { - "type": "integer" - }, - "skills": { - "type": "array", - "items": { - "type": "string", - "maxLength": 10 - }, - "minItems": 3 - }, - "work history": { - "type": "array", - "items": { - "type": "object", - "properties": { - "company": { - "type": "string" - }, - "duration": { - "type": "string" - }, - "position": { - "type": "string" - } - }, - "required": ["company", "position"] - } - } - }, - "required": ["name", "age", "skills", "work history"] -} - -TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") - -TEST_CHOICE = [ - "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", - "Swift", "Kotlin" -] - pytestmark = pytest.mark.openai @@ -115,7 +68,7 @@ def server(zephyr_lora_files, ray_ctx): def client(server): return server.get_async_client() - +@pytest.mark.asyncio async def test_check_models(client: openai.AsyncOpenAI): models = await client.models.list() models = models.data @@ -127,1086 +80,1102 @@ async def test_check_models(client: openai.AsyncOpenAI): assert lora_models[1].id == "zephyr-lora2" -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], -) -async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): - completion = await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert len(choice.text) >= 5 - assert choice.finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) - - # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 5 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - 
# first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], -) -async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=None, - ) - choice = completion.choices[0] - assert choice.logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # just test 1 lora hereafter - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=0, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is not None - assert len(choice.logprobs.top_logprobs[0]) == 1 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=5, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is not None - assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, - model_name: str): - - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=21, - ) - ... - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - stream = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=30, - stream=True, - ) - async for chunk in stream: - ... - - # the server should still work afterwards - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], -) -async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" 
- }] - - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=5, - temperature=0.0, - logprobs=False) - - choice = chat_completion.choices[0] - assert choice.logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # just test 1 lora hereafter - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] - - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=5, - temperature=0.0, - logprobs=True, - top_logprobs=0) - - choice = chat_completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.content is not None - assert len(choice.logprobs.content[0].top_logprobs) == 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] - - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=5, - temperature=0.0, - logprobs=True, - top_logprobs=5) - - choice = chat_completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.content is not None - assert len(choice.logprobs.content[0].top_logprobs) == 5 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] - - # Default max_logprobs is 20, so this should raise an error - with pytest.raises((openai.BadRequestError, openai.APIError)): - stream = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=21, - stream=True) - async for chunk in stream: - ... - - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=30, - stream=False) - - # the server should still work afterwards - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - stream=False) - message = chat_completion.choices[0].message - assert message.content is not None and len(message.content) >= 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_single_chat_session(client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" 
- }] - - # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5) - assert chat_completion.id is not None - assert len(chat_completion.choices) == 1 - - choice = chat_completion.choices[0] - assert choice.finish_reason == "length" - assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=37, total_tokens=47) - - message = choice.message - assert message.content is not None and len(message.content) >= 10 - assert message.role == "assistant" - messages.append({"role": "assistant", "content": message.content}) - - # test multi-turn dialogue - messages.append({"role": "user", "content": "express your result in json"}) - chat_completion = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - ) - message = chat_completion.choices[0].message - assert message.content is not None and len(message.content) >= 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_completion_streaming(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is an LLM?" - - single_completion = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - ) - single_output = single_completion.choices[0].text - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True) - chunks: List[str] = [] - finish_reason_count = 0 - async for chunk in stream: - chunks.append(chunk.choices[0].text) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - # finish reason should only return in last block - assert finish_reason_count == 1 - assert chunk.choices[0].finish_reason == "length" - assert chunk.choices[0].text - assert "".join(chunks) == single_output - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # just test 1 lora hereafter - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] - - # test single completion - chat_completion = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - ) - output = chat_completion.choices[0].message.content - stop_reason = chat_completion.choices[0].finish_reason - - # test streaming - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - ) - chunks: List[str] = [] - finish_reason_count = 0 - async for chunk in stream: - delta = chunk.choices[0].delta - if delta.role: - assert delta.role == "assistant" - if delta.content: - chunks.append(delta.content) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - # finish reason should only return in last block - assert finish_reason_count == 1 - assert chunk.choices[0].finish_reason == stop_reason - assert delta.content - assert "".join(chunks) == output - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], -) -async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, - model_name: str): - messages = [{ - "role": "system", - "content": "You are a helpful assistant." 
- }, { - "role": "user", - "content": "What is the capital of France?" - }] - - # Test stream=True, stream_options={"include_usage": False} - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - stream_options={"include_usage": False}) - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options={"include_usage": True} - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - stream_options={"include_usage": True}) - - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - else: - assert chunk.usage is None - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=False, stream_options={"include_usage": None} - with pytest.raises(BadRequestError): - await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, stream_options={"include_usage": True} - with pytest.raises(BadRequestError): - await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], -) -async def test_completion_stream_options(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is the capital of France?" 
- - # Test stream=True, stream_options={"include_usage": False} - stream = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={"include_usage": False}) - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options={"include_usage": True} - stream = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={"include_usage": True}) - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - else: - assert chunk.usage is None - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=False, stream_options={"include_usage": None} - with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, stream_options={"include_usage": True} - with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # just test 1 lora hereafter - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): - # test both text and token IDs - for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): - # test simple list - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=model_name, - prompt=prompts, - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary - # for official client. 
- use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" - - # test streaming - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] - - -@pytest.mark.asyncio -async def test_logits_bias(client: openai.AsyncOpenAI): - prompt = "Hello, my name is" - max_tokens = 5 - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - - # Test exclusive selection - token_id = 1000 - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token_id): 100}, - seed=42, - ) - assert len(completion.choices[0].text) >= 5 - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), - add_special_tokens=False)["input_ids"] - assert all([ - response == expected - for response, expected in zip(response_tokens, expected_tokens) - ]) - - # Test ban - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - ) - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - first_response = completion.choices[0].text - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token): -100 - for token in response_tokens}, - ) - assert first_response != completion.choices[0].text - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_json_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example JSON for an employee profile " - f"that fits this schema: {TEST_SCHEMA}", - n=3, - temperature=1.0, - max_tokens=500, - extra_body=dict(guided_json=TEST_SCHEMA, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - output_json = json.loads(completion.choices[i].text) - jsonschema.validate(instance=output_json, schema=TEST_SCHEMA) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_json_chat(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {TEST_SCHEMA}" - }] - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=1000, - extra_body=dict(guided_json=TEST_SCHEMA, - guided_decoding_backend=guided_decoding_backend)) - message = chat_completion.choices[0].message - assert message.content is not None - json1 = json.loads(message.content) - 
jsonschema.validate(instance=json1, schema=TEST_SCHEMA) - - messages.append({"role": "assistant", "content": message.content}) - messages.append({ - "role": - "user", - "content": - "Give me another one with a different name and age" - }) - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=1000, - extra_body=dict(guided_json=TEST_SCHEMA, - guided_decoding_backend=guided_decoding_backend)) - message = chat_completion.choices[0].message - assert message.content is not None - json2 = json.loads(message.content) - jsonschema.validate(instance=json2, schema=TEST_SCHEMA) - assert json1["name"] != json2["name"] - assert json1["age"] != json2["age"] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_regex_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}", - n=3, - temperature=1.0, - max_tokens=20, - extra_body=dict(guided_regex=TEST_REGEX, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_regex_chat(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example IP address with this regex: {TEST_REGEX}" - }] - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=20, - extra_body=dict(guided_regex=TEST_REGEX, - guided_decoding_backend=guided_decoding_backend)) - ip1 = chat_completion.choices[0].message.content - assert ip1 is not None - assert re.fullmatch(TEST_REGEX, ip1) is not None - - messages.append({"role": "assistant", "content": ip1}) - messages.append({"role": "user", "content": "Give me a different one"}) - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=20, - extra_body=dict(guided_regex=TEST_REGEX, - guided_decoding_backend=guided_decoding_backend)) - ip2 = chat_completion.choices[0].message.content - assert ip2 is not None - assert re.fullmatch(TEST_REGEX, ip2) is not None - assert ip1 != ip2 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - completion = await client.completions.create( - model=MODEL_NAME, - prompt="The best language for type-safe systems programming is ", - n=2, - temperature=1.0, - max_tokens=10, - extra_body=dict(guided_choice=TEST_CHOICE, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 2 - for i in range(2): - assert completion.choices[i].text in TEST_CHOICE - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_chat(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": 
- "user", - "content": - "The best language for type-safe systems programming is " - }] - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=10, - extra_body=dict(guided_choice=TEST_CHOICE, - guided_decoding_backend=guided_decoding_backend)) - choice1 = chat_completion.choices[0].message.content - assert choice1 in TEST_CHOICE - - messages.append({"role": "assistant", "content": choice1}) - messages.append({ - "role": "user", - "content": "I disagree, pick another one" - }) - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=10, - extra_body=dict(guided_choice=TEST_CHOICE, - guided_decoding_backend=guided_decoding_backend)) - choice2 = chat_completion.choices[0].message.content - assert choice2 in TEST_CHOICE - assert choice1 != choice2 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example JSON that fits this schema: 42", - extra_body=dict(guided_json=42, - guided_decoding_backend=guided_decoding_backend)) - - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - "The best language for type-safe systems programming is " - }] - with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create(model=MODEL_NAME, - messages=messages, - extra_body=dict(guided_regex={ - 1: "Python", - 2: "C++" - })) - - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example string that fits this regex", - extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA)) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - "The best language for type-safe systems programming is " - }] - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5, - extra_body=dict(guided_choice=TEST_CHOICE, - guided_decoding_backend=guided_decoding_backend)) - - assert chat_completion.choices[0].logprobs is not None - assert chat_completion.choices[0].logprobs.content is not None - top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs - - # -9999.0 is the minimum logprob returned by OpenAI - for item in top_logprobs: - assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})" - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_named_tool_use(client: openai.AsyncOpenAI, - guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {TEST_SCHEMA}" - }] - - # non-streaming - - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=1000, - tools=[{ - "type": 
"function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": TEST_SCHEMA - } - }], - tool_choice={ - "type": "function", - "function": { - "name": "dummy_function_name" - } - }) - message = chat_completion.choices[0].message - assert len(message.content) == 0 - json_string = message.tool_calls[0].function.arguments - json1 = json.loads(json_string) - jsonschema.validate(instance=json1, schema=TEST_SCHEMA) - - messages.append({"role": "assistant", "content": json_string}) - messages.append({ - "role": - "user", - "content": - "Give me another one with a different name and age" - }) - - # streaming - - stream = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": TEST_SCHEMA - } - }], - tool_choice={ - "type": "function", - "function": { - "name": "dummy_function_name" - } - }, - stream=True) - - output = [] - finish_reason_count = 0 - async for chunk in stream: - delta = chunk.choices[0].delta - if delta.role: - assert delta.role == "assistant" - assert delta.content is None or len(delta.content) == 0 - if delta.tool_calls: - output.append(delta.tool_calls[0].function.arguments) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - # finish reason should only return in last block - assert finish_reason_count == 1 - json2 = json.loads("".join(output)) - jsonschema.validate(instance=json2, schema=TEST_SCHEMA) - assert json1["name"] != json2["name"] - assert json1["age"] != json2["age"] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) -async def test_required_tool_use_not_yet_supported( - client: openai.AsyncOpenAI, guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {TEST_SCHEMA}" - }] - - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": TEST_SCHEMA - } - }], - tool_choice="required") - - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": TEST_SCHEMA - } - }], - tool_choice="auto") - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) -async def test_inconsistent_tool_choice_and_tools( - client: openai.AsyncOpenAI, guided_decoding_backend: str): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {TEST_SCHEMA}" - }] - - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create(model=MODEL_NAME, - messages=messages, - max_tokens=1000, - tool_choice={ - "type": "function", - "function": { - "name": - "dummy_function_name" - } - }) - - with pytest.raises(openai.BadRequestError): - await client.chat.completions.create( - model=MODEL_NAME, - 
messages=messages, - max_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": TEST_SCHEMA - } - }], - tool_choice={ - "type": "function", - "function": { - "name": "nondefined_function_name" - } - }) - - -@pytest.mark.asyncio -async def test_response_format_json_object(client: openai.AsyncOpenAI): - for _ in range(2): - resp = await client.chat.completions.create( - model=MODEL_NAME, - messages=[{ - "role": - "user", - "content": ('what is 1+1? please respond with a JSON object, ' - 'the format is {"result": 2}') - }], - response_format={"type": "json_object"}) - - content = resp.choices[0].message.content - assert content is not None - - loaded = json.loads(content) - assert loaded == {"result": 2}, loaded +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # first test base model, then test loras +# "model_name", +# [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +# ) +# async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): +# completion = await client.completions.create(model=model_name, +# prompt="Hello, my name is", +# max_tokens=5, +# temperature=0.0) + +# assert completion.id is not None +# assert completion.choices is not None and len(completion.choices) == 1 + +# choice = completion.choices[0] +# assert len(choice.text) >= 5 +# assert choice.finish_reason == "length" +# assert completion.usage == openai.types.CompletionUsage( +# completion_tokens=5, prompt_tokens=6, total_tokens=11) + +# # test using token IDs +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# ) +# assert len(completion.choices[0].text) >= 5 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # first test base model, then test loras +# "model_name", +# [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +# ) +# async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): +# # test using token IDs +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# logprobs=None, +# ) +# choice = completion.choices[0] +# assert choice.logprobs is None + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # just test 1 lora hereafter +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): +# # test using token IDs +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# logprobs=0, +# ) +# choice = completion.choices[0] +# assert choice.logprobs is not None +# assert choice.logprobs.token_logprobs is not None +# assert choice.logprobs.top_logprobs is not None +# assert len(choice.logprobs.top_logprobs[0]) == 1 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): +# # test using token IDs +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# logprobs=5, +# ) +# choice = completion.choices[0] +# assert choice.logprobs is not None +# assert choice.logprobs.token_logprobs is not None +# assert choice.logprobs.top_logprobs is not None +# assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# 
"model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, +# model_name: str): + +# with pytest.raises( +# (openai.BadRequestError, openai.APIError)): # test using token IDs +# await client.completions.create( +# model=MODEL_NAME, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# # vLLM has higher default max_logprobs (20 instead of 5) to support +# # both Completion API and Chat Completion API +# logprobs=21, +# ) +# ... +# with pytest.raises( +# (openai.BadRequestError, openai.APIError)): # test using token IDs +# stream = await client.completions.create( +# model=MODEL_NAME, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# # vLLM has higher default max_logprobs (20 instead of 5) to support +# # both Completion API and Chat Completion API +# logprobs=30, +# stream=True, +# ) +# async for chunk in stream: +# ... + +# # the server should still work afterwards +# completion = await client.completions.create( +# model=model_name, +# prompt=[0, 0, 0, 0, 0], +# max_tokens=5, +# temperature=0.0, +# ) +# assert len(completion.choices[0].text) >= 0 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # first test base model, then test loras +# "model_name", +# [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +# ) +# async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": "user", +# "content": "what is 1+1?" +# }] + +# chat_completion = await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=5, +# temperature=0.0, +# logprobs=False) + +# choice = chat_completion.choices[0] +# assert choice.logprobs is None + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # just test 1 lora hereafter +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": "user", +# "content": "what is 1+1?" +# }] + +# chat_completion = await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=5, +# temperature=0.0, +# logprobs=True, +# top_logprobs=0) + +# choice = chat_completion.choices[0] +# assert choice.logprobs is not None +# assert choice.logprobs.content is not None +# assert len(choice.logprobs.content[0].top_logprobs) == 0 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": "user", +# "content": "what is 1+1?" +# }] + +# chat_completion = await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=5, +# temperature=0.0, +# logprobs=True, +# top_logprobs=5) + +# choice = chat_completion.choices[0] +# assert choice.logprobs is not None +# assert choice.logprobs.content is not None +# assert len(choice.logprobs.content[0].top_logprobs) == 5 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, +# model_name: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": "user", +# "content": "what is 1+1?" 
+# }] + +# # Default max_logprobs is 20, so this should raise an error +# with pytest.raises((openai.BadRequestError, openai.APIError)): +# stream = await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=10, +# logprobs=True, +# top_logprobs=21, +# stream=True) +# async for chunk in stream: +# ... + +# with pytest.raises(openai.BadRequestError): +# await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=10, +# logprobs=True, +# top_logprobs=30, +# stream=False) + +# # the server should still work afterwards +# chat_completion = await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=10, +# stream=False) +# message = chat_completion.choices[0].message +# assert message.content is not None and len(message.content) >= 0 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_single_chat_session(client: openai.AsyncOpenAI, +# model_name: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": "user", +# "content": "what is 1+1?" +# }] + +# # test single completion +# chat_completion = await client.chat.completions.create(model=model_name, +# messages=messages, +# max_tokens=10, +# logprobs=True, +# top_logprobs=5) +# assert chat_completion.id is not None +# assert len(chat_completion.choices) == 1 + +# choice = chat_completion.choices[0] +# assert choice.finish_reason == "length" +# assert chat_completion.usage == openai.types.CompletionUsage( +# completion_tokens=10, prompt_tokens=37, total_tokens=47) + +# message = choice.message +# assert message.content is not None and len(message.content) >= 10 +# assert message.role == "assistant" +# messages.append({"role": "assistant", "content": message.content}) + +# # test multi-turn dialogue +# messages.append({"role": "user", "content": "express your result in json"}) +# chat_completion = await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# ) +# message = chat_completion.choices[0].message +# assert message.content is not None and len(message.content) >= 0 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_completion_streaming(client: openai.AsyncOpenAI, +# model_name: str): +# prompt = "What is an LLM?" + +# single_completion = await client.completions.create( +# model=model_name, +# prompt=prompt, +# max_tokens=5, +# temperature=0.0, +# ) +# single_output = single_completion.choices[0].text +# stream = await client.completions.create(model=model_name, +# prompt=prompt, +# max_tokens=5, +# temperature=0.0, +# stream=True) +# chunks: List[str] = [] +# finish_reason_count = 0 +# async for chunk in stream: +# chunks.append(chunk.choices[0].text) +# if chunk.choices[0].finish_reason is not None: +# finish_reason_count += 1 +# # finish reason should only return in last block +# assert finish_reason_count == 1 +# assert chunk.choices[0].finish_reason == "length" +# assert chunk.choices[0].text +# assert "".join(chunks) == single_output + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # just test 1 lora hereafter +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": "user", +# "content": "what is 1+1?" 
+# }] + +# # test single completion +# chat_completion = await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# temperature=0.0, +# ) +# output = chat_completion.choices[0].message.content +# stop_reason = chat_completion.choices[0].finish_reason + +# # test streaming +# stream = await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# temperature=0.0, +# stream=True, +# ) +# chunks: List[str] = [] +# finish_reason_count = 0 +# async for chunk in stream: +# delta = chunk.choices[0].delta +# if delta.role: +# assert delta.role == "assistant" +# if delta.content: +# chunks.append(delta.content) +# if chunk.choices[0].finish_reason is not None: +# finish_reason_count += 1 +# # finish reason should only return in last block +# assert finish_reason_count == 1 +# assert chunk.choices[0].finish_reason == stop_reason +# assert delta.content +# assert "".join(chunks) == output + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], +# ) +# async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, +# model_name: str): +# messages = [{ +# "role": "system", +# "content": "You are a helpful assistant." +# }, { +# "role": "user", +# "content": "What is the capital of France?" +# }] + +# # Test stream=True, stream_options={"include_usage": False} +# stream = await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# temperature=0.0, +# stream=True, +# stream_options={"include_usage": False}) +# async for chunk in stream: +# assert chunk.usage is None + +# # Test stream=True, stream_options={"include_usage": True} +# stream = await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# temperature=0.0, +# stream=True, +# stream_options={"include_usage": True}) + +# async for chunk in stream: +# if chunk.choices[0].finish_reason is None: +# assert chunk.usage is None +# else: +# assert chunk.usage is None +# final_chunk = await stream.__anext__() +# assert final_chunk.usage is not None +# assert final_chunk.usage.prompt_tokens > 0 +# assert final_chunk.usage.completion_tokens > 0 +# assert final_chunk.usage.total_tokens == ( +# final_chunk.usage.prompt_tokens + +# final_chunk.usage.completion_tokens) +# assert final_chunk.choices == [] + +# # Test stream=False, stream_options={"include_usage": None} +# with pytest.raises(BadRequestError): +# await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# temperature=0.0, +# stream=False, +# stream_options={"include_usage": None}) + +# # Test stream=False, stream_options={"include_usage": True} +# with pytest.raises(BadRequestError): +# await client.chat.completions.create( +# model=model_name, +# messages=messages, +# max_tokens=10, +# temperature=0.0, +# stream=False, +# stream_options={"include_usage": True}) + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# "model_name", +# ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], +# ) +# async def test_completion_stream_options(client: openai.AsyncOpenAI, +# model_name: str): +# prompt = "What is the capital of France?" 
+ +# # Test stream=True, stream_options={"include_usage": False} +# stream = await client.completions.create( +# model=model_name, +# prompt=prompt, +# max_tokens=5, +# temperature=0.0, +# stream=True, +# stream_options={"include_usage": False}) +# async for chunk in stream: +# assert chunk.usage is None + +# # Test stream=True, stream_options={"include_usage": True} +# stream = await client.completions.create( +# model=model_name, +# prompt=prompt, +# max_tokens=5, +# temperature=0.0, +# stream=True, +# stream_options={"include_usage": True}) +# async for chunk in stream: +# if chunk.choices[0].finish_reason is None: +# assert chunk.usage is None +# else: +# assert chunk.usage is None +# final_chunk = await stream.__anext__() +# assert final_chunk.usage is not None +# assert final_chunk.usage.prompt_tokens > 0 +# assert final_chunk.usage.completion_tokens > 0 +# assert final_chunk.usage.total_tokens == ( +# final_chunk.usage.prompt_tokens + +# final_chunk.usage.completion_tokens) +# assert final_chunk.choices == [] + +# # Test stream=False, stream_options={"include_usage": None} +# with pytest.raises(BadRequestError): +# await client.completions.create(model=model_name, +# prompt=prompt, +# max_tokens=5, +# temperature=0.0, +# stream=False, +# stream_options={"include_usage": None}) + +# # Test stream=False, stream_options={"include_usage": True} +# with pytest.raises(BadRequestError): +# await client.completions.create(model=model_name, +# prompt=prompt, +# max_tokens=5, +# temperature=0.0, +# stream=False, +# stream_options={"include_usage": True}) + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize( +# # just test 1 lora hereafter +# "model_name", +# [MODEL_NAME, "zephyr-lora"], +# ) +# async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): +# # test both text and token IDs +# for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): +# # test simple list +# batch = await client.completions.create( +# model=model_name, +# prompt=prompts, +# max_tokens=5, +# temperature=0.0, +# ) +# assert len(batch.choices) == 2 +# assert batch.choices[0].text == batch.choices[1].text + +# # test n = 2 +# batch = await client.completions.create( +# model=model_name, +# prompt=prompts, +# n=2, +# max_tokens=5, +# temperature=0.0, +# extra_body=dict( +# # NOTE: this has to be true for n > 1 in vLLM, but not necessary +# # for official client. 
+# use_beam_search=True), +# ) +# assert len(batch.choices) == 4 +# assert batch.choices[0].text != batch.choices[ +# 1].text, "beam search should be different" +# assert batch.choices[0].text == batch.choices[ +# 2].text, "two copies of the same prompt should be the same" +# assert batch.choices[1].text == batch.choices[ +# 3].text, "two copies of the same prompt should be the same" + +# # test streaming +# batch = await client.completions.create( +# model=model_name, +# prompt=prompts, +# max_tokens=5, +# temperature=0.0, +# stream=True, +# ) +# texts = [""] * 2 +# async for chunk in batch: +# assert len(chunk.choices) == 1 +# choice = chunk.choices[0] +# texts[choice.index] += choice.text +# assert texts[0] == texts[1] + + +# @pytest.mark.asyncio +# async def test_logits_bias(client: openai.AsyncOpenAI): +# prompt = "Hello, my name is" +# max_tokens = 5 +# tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + +# # Test exclusive selection +# token_id = 1000 +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=prompt, +# max_tokens=max_tokens, +# temperature=0.0, +# logit_bias={str(token_id): 100}, +# seed=42, +# ) +# assert len(completion.choices[0].text) >= 5 +# response_tokens = tokenizer(completion.choices[0].text, +# add_special_tokens=False)["input_ids"] +# expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), +# add_special_tokens=False)["input_ids"] +# assert all([ +# response == expected +# for response, expected in zip(response_tokens, expected_tokens) +# ]) + +# # Test ban +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=prompt, +# max_tokens=max_tokens, +# temperature=0.0, +# ) +# response_tokens = tokenizer(completion.choices[0].text, +# add_special_tokens=False)["input_ids"] +# first_response = completion.choices[0].text +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=prompt, +# max_tokens=max_tokens, +# temperature=0.0, +# logit_bias={str(token): -100 +# for token in response_tokens}, +# ) +# assert first_response != completion.choices[0].text + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_json_completion(server, sample_json_schema, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=f"Give an example JSON for an employee profile " +# f"that fits this schema: {sample_json_schema}", +# n=3, +# temperature=1.0, +# max_tokens=500, +# extra_body=dict(guided_json=sample_json_schema)) + +# assert completion.id is not None +# assert len(completion.choices) == 3 +# for i in range(3): +# output_json = json.loads(completion.choices[i].text) +# jsonschema.validate(instance=output_json, schema=sample_json_schema) + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_json_chat(server, sample_json_schema, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# f"Give an example JSON for an employee profile that " +# f"fits this schema: {sample_json_schema}" +# }] +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=500, +# extra_body=dict(guided_json=sample_json_schema, +# 
guided_decoding_backend=guided_decoding_backend)) +# message = chat_completion.choices[0].message +# assert message.content is not None +# json1 = json.loads(message.content) +# jsonschema.validate(instance=json1, schema=sample_json_schema) + +# messages.append({"role": "assistant", "content": message.content}) +# messages.append({ +# "role": +# "user", +# "content": +# "Give me another one with a different name and age" +# }) +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=500, +# extra_body=dict(guided_json=sample_json_schema, +# guided_decoding_backend=guided_decoding_backend)) +# message = chat_completion.choices[0].message +# assert message.content is not None +# json2 = json.loads(message.content) +# jsonschema.validate(instance=json2, schema=sample_json_schema) +# assert json1["name"] != json2["name"] +# assert json1["age"] != json2["age"] + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_regex_completion(server, sample_regex, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt=f"Give an example IPv4 address with this regex: {sample_regex}", +# n=3, +# temperature=1.0, +# max_tokens=20, +# extra_body=dict(guided_regex=sample_regex, +# guided_decoding_backend=guided_decoding_backend)) + +# assert completion.id is not None +# assert len(completion.choices) == 3 +# for i in range(3): +# assert completion.choices[i].text is not None +# assert re.fullmatch(sample_regex, +# completion.choices[i].text) is not None + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_regex_chat(server, sample_regex, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# f"Give an example IP address with this regex: {sample_regex}" +# }] +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=20, +# extra_body=dict(guided_regex=sample_regex, +# guided_decoding_backend=guided_decoding_backend)) +# ip1 = chat_completion.choices[0].message.content +# assert ip1 is not None +# assert re.fullmatch(sample_regex, ip1) is not None + +# messages.append({"role": "assistant", "content": ip1}) +# messages.append({"role": "user", "content": "Give me a different one"}) +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=20, +# extra_body=dict(guided_regex=sample_regex, +# guided_decoding_backend=guided_decoding_backend)) +# ip2 = chat_completion.choices[0].message.content +# assert ip2 is not None +# assert re.fullmatch(sample_regex, ip2) is not None +# assert ip1 != ip2 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_choice_completion(server, sample_guided_choice, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# completion = await client.completions.create( +# model=MODEL_NAME, +# prompt="The best language for type-safe systems programming is ", +# n=2, +# temperature=1.0, +# max_tokens=10, +# extra_body=dict(guided_choice=sample_guided_choice, +# guided_decoding_backend=guided_decoding_backend)) + +# 
assert completion.id is not None +# assert len(completion.choices) == 2 +# for i in range(2): +# assert completion.choices[i].text in sample_guided_choice + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_choice_chat(server, sample_guided_choice, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# "The best language for type-safe systems programming is " +# }] +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=10, +# extra_body=dict(guided_choice=sample_guided_choice, +# guided_decoding_backend=guided_decoding_backend)) +# choice1 = chat_completion.choices[0].message.content +# assert choice1 in sample_guided_choice + +# messages.append({"role": "assistant", "content": choice1}) +# messages.append({ +# "role": "user", +# "content": "I disagree, pick another one" +# }) +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=10, +# extra_body=dict(guided_choice=sample_guided_choice, +# guided_decoding_backend=guided_decoding_backend)) +# choice2 = chat_completion.choices[0].message.content +# assert choice2 in sample_guided_choice +# assert choice1 != choice2 + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_decoding_type_error(server, sample_regex, +# sample_json_schema, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# with pytest.raises(openai.BadRequestError): +# _ = await client.completions.create( +# model=MODEL_NAME, +# prompt="Give an example JSON that fits this schema: 42", +# extra_body=dict(guided_json=42, +# guided_decoding_backend=guided_decoding_backend)) + +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# "The best language for type-safe systems programming is " +# }] +# with pytest.raises(openai.BadRequestError): +# _ = await client.chat.completions.create(model=MODEL_NAME, +# messages=messages, +# extra_body=dict(guided_regex={ +# 1: "Python", +# 2: "C++" +# })) + +# with pytest.raises(openai.BadRequestError): +# _ = await client.completions.create( +# model=MODEL_NAME, +# prompt="Give an example string that fits this regex", +# extra_body=dict(guided_regex=sample_regex, +# guided_json=sample_json_schema)) + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_guided_choice_chat_logprobs(server, sample_guided_choice, +# client: openai.AsyncOpenAI, +# guided_decoding_backend: str): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# "The best language for type-safe systems programming is " +# }] +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=10, +# logprobs=True, +# top_logprobs=5, +# extra_body=dict(guided_choice=sample_guided_choice, +# guided_decoding_backend=guided_decoding_backend)) + +# assert chat_completion.choices[0].logprobs is not None +# assert chat_completion.choices[0].logprobs.content is not None +# top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs + +# # -9999.0 
is the minimum logprob returned by OpenAI +# for item in top_logprobs: +# assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})" + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", +# ["outlines", "lm-format-enforcer"]) +# async def test_named_tool_use(client: openai.AsyncOpenAI, +# guided_decoding_backend: str, +# sample_json_schema): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# f"Give an example JSON for an employee profile that " +# f"fits this schema: {sample_json_schema}" +# }] + +# # non-streaming + +# chat_completion = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=1000, +# tools=[{ +# "type": "function", +# "function": { +# "name": "dummy_function_name", +# "description": "This is a dummy function", +# "parameters": sample_json_schema +# } +# }], +# tool_choice={ +# "type": "function", +# "function": { +# "name": "dummy_function_name" +# } +# }) +# message = chat_completion.choices[0].message +# assert len(message.content) == 0 +# json_string = message.tool_calls[0].function.arguments +# json1 = json.loads(json_string) +# jsonschema.validate(instance=json1, schema=sample_json_schema) + +# messages.append({"role": "assistant", "content": json_string}) +# messages.append({ +# "role": +# "user", +# "content": +# "Give me another one with a different name and age" +# }) + +# # streaming + +# stream = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=1000, +# tools=[{ +# "type": "function", +# "function": { +# "name": "dummy_function_name", +# "description": "This is a dummy function", +# "parameters": sample_json_schema +# } +# }], +# tool_choice={ +# "type": "function", +# "function": { +# "name": "dummy_function_name" +# } +# }, +# stream=True) + +# output = [] +# finish_reason_count = 0 +# async for chunk in stream: +# delta = chunk.choices[0].delta +# if delta.role: +# assert delta.role == "assistant" +# assert delta.content is None or len(delta.content) == 0 +# if delta.tool_calls: +# output.append(delta.tool_calls[0].function.arguments) +# if chunk.choices[0].finish_reason is not None: +# finish_reason_count += 1 +# # finish reason should only return in last block +# assert finish_reason_count == 1 +# json2 = json.loads("".join(output)) +# jsonschema.validate(instance=json2, schema=sample_json_schema) +# assert json1["name"] != json2["name"] +# assert json1["age"] != json2["age"] + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) +# async def test_required_tool_use_not_yet_supported( +# client: openai.AsyncOpenAI, guided_decoding_backend: str, +# sample_json_schema): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# f"Give an example JSON for an employee profile that " +# f"fits this schema: {sample_json_schema}" +# }] + +# with pytest.raises(openai.BadRequestError): +# await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=1000, +# tools=[{ +# "type": "function", +# "function": { +# "name": "dummy_function_name", +# "description": "This is a dummy function", +# "parameters": sample_json_schema +# } +# }], +# tool_choice="required") + +# with pytest.raises(openai.BadRequestError): +# await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=1000, +# tools=[{ +# 
"type": "function", +# "function": { +# "name": "dummy_function_name", +# "description": "This is a dummy function", +# "parameters": sample_json_schema +# } +# }], +# tool_choice="auto") + + +# @pytest.mark.asyncio +# @pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) +# async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, +# guided_decoding_backend: str, +# sample_json_schema): +# messages = [{ +# "role": "system", +# "content": "you are a helpful assistant" +# }, { +# "role": +# "user", +# "content": +# f"Give an example JSON for an employee profile that " +# f"fits this schema: {sample_json_schema}" +# }] + +# with pytest.raises(openai.BadRequestError): +# await client.chat.completions.create(model=MODEL_NAME, +# messages=messages, +# max_tokens=1000, +# tool_choice={ +# "type": "function", +# "function": { +# "name": +# "dummy_function_name" +# } +# }) + +# with pytest.raises(openai.BadRequestError): +# await client.chat.completions.create( +# model=MODEL_NAME, +# messages=messages, +# max_tokens=1000, +# tools=[{ +# "type": "function", +# "function": { +# "name": "dummy_function_name", +# "description": "This is a dummy function", +# "parameters": sample_json_schema +# } +# }], +# tool_choice={ +# "type": "function", +# "function": { +# "name": "nondefined_function_name" +# } +# }) + + +# @pytest.mark.asyncio +# async def test_response_format_json_object(client: openai.AsyncOpenAI): +# for _ in range(2): +# result_format = {"result": "2"} +# resp = await client.chat.completions.create( +# model=MODEL_NAME, +# messages=[{ +# "role": +# "user", +# "content": 'what is 1+1? please respond with a JSON object with the format: {"result": "integer"}' +# }], +# response_format={"type": "json_object"}) + +# content = resp.choices[0].message.content +# print(content) +# assert content is not None + +# loaded = json.loads(content) +# print(loaded) +# assert loaded == {"result": 2}, loaded @pytest.mark.asyncio @@ -1275,34 +1244,22 @@ async def test_custom_role(client: openai.AsyncOpenAI): content2 = resp2.choices[0].message.content assert content1 == content2 - @pytest.mark.asyncio -async def test_guided_grammar(client: openai.AsyncOpenAI): - simple_sql_grammar = """ -start: select_statement - -select_statement: "SELECT" column "from" table "where" condition - -column: "col_1" | "col_2" -table: "table_1" | "table_2" -condition: column "=" number - -number: "1" | "2" -""" - +async def test_guided_grammar(server, sample_sql_statements, + client: openai.AsyncOpenAI): completion = await client.completions.create( model=MODEL_NAME, prompt=("Generate a sql state that select col_1 from " "table_1 where it is equals to 1"), temperature=1.0, max_tokens=500, - extra_body=dict(guided_grammar=simple_sql_grammar)) + extra_body=dict(guided_grammar=sample_sql_statements)) content = completion.choices[0].text # use Lark to parse the output, and make sure it's a valid parse tree from lark import Lark - parser = Lark(simple_sql_grammar) + parser = Lark(sample_sql_statements) parser.parse(content) # remove spaces for comparison b/c we removed them in the grammar diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 8759ee06795b8..73a27d2e00cf3 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import enum from abc import ABC, abstractmethod from typing import List diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 9e923493160ed..987e21cc2b5a3 100644 --- 
a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1,5 +1,6 @@
 from contextlib import contextmanager
-from typing import ClassVar, List, Optional, Sequence, Union, cast, overload
+from typing import (ClassVar, Dict, List, Optional, Sequence, Union, cast,
+                    overload)
 
 from tqdm import tqdm
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -11,6 +12,8 @@
                          parse_and_batch_prompt)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.model_executor.guided_decoding import (
+    GuidedDecodingFields, get_guided_decoding_logits_processor)
 from vllm.outputs import EmbeddingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
@@ -250,6 +253,7 @@ def generate(
         prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        guided_options: Optional[Union[Dict, "GuidedDecodingFields"]] = None
     ) -> List[RequestOutput]:
         """Generates the completions for the input prompts.
 
@@ -260,9 +264,9 @@ def generate(
         Args:
             inputs: A list of inputs to generate completions for.
             sampling_params: The sampling parameters for text generation. If
-                None, we use the default sampling parameters.
-                When it is a single value, it is applied to every prompt.
-                When it is a list, the list must have the same length as the
+                None, we use the default sampling parameters.
+                When it is a single value, it is applied to every prompt.
+                When it is a list, the list must have the same length as the
                 prompts and it is paired one by one with the prompt.
             use_tqdm: Whether to use tqdm to display the progress bar.
             lora_request: LoRA request to use for generation, if any.
@@ -291,15 +295,18 @@ def generate(
             Union[PromptStrictInputs, Sequence[PromptStrictInputs]],
             prompts)
 
+        if isinstance(guided_options, dict) and len(guided_options) > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding but multiple "
+                f"are specified: {guided_options}")
         if sampling_params is None:
             # Use default sampling params.
             sampling_params = SamplingParams()
 
-        self._validate_and_add_requests(
-            inputs=inputs,
-            params=sampling_params,
-            lora_request=lora_request,
-        )
+        self._validate_and_add_requests(inputs=inputs,
+                                        params=sampling_params,
+                                        lora_request=lora_request,
+                                        guided_options=guided_options)
 
         outputs = self._run_engine(use_tqdm=use_tqdm)
         return LLMEngine.validate_outputs(outputs, RequestOutput)
@@ -499,22 +506,33 @@ def _validate_and_add_requests(
         params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams,
                       Sequence[PoolingParams]],
         lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
+        guided_options: Optional[Union[Dict, "GuidedDecodingFields"]] = None
     ) -> None:
+
         if isinstance(inputs, (str, dict)):
             # Convert a single prompt to a list.
             inputs = [inputs]
 
         num_requests = len(inputs)
-
-        if isinstance(params, list) and len(params) != num_requests:
-            raise ValueError("The lengths of prompts and params "
-                             "must be the same.")
         if isinstance(lora_request,
                       list) and len(lora_request) != num_requests:
             raise ValueError("The lengths of prompts and lora_request "
                              "must be the same.")
 
+        if params is None:
+            # Use default sampling params.
+ params = [SamplingParams()] * num_requests + elif isinstance(params, list): + if len(params) != num_requests: + raise ValueError("The lengths of prompts and params " + "must be the same.") + + params = [ + self._add_guided_processor(param, guided_options) + for param in params if isinstance(param, SamplingParams) + ] + elif isinstance(params, SamplingParams): + params = self._add_guided_processor(params, guided_options) - # Add requests to the engine. for i, request_inputs in enumerate(inputs): self._add_request( request_inputs, @@ -523,6 +541,42 @@ def _validate_and_add_requests( lora_request, Sequence) else lora_request, ) + def _add_guided_processor( + self, + params: SamplingParams, + guided_options: Optional[Union[Dict, + "GuidedDecodingFields"]] = None): + if guided_options: + if isinstance(guided_options, dict): + guided_options = GuidedDecodingFields(**guided_options) + if guided_options.guided_decoding_backend is None: + decoding_config = self.llm_engine.get_decoding_config() + guided_options.guided_decoding_backend = ( + decoding_config.guided_decoding_backend) + guided_logits_processor = get_guided_decoding_logits_processor( + guided_options, self.get_tokenizer()) + if guided_logits_processor: + if params.logits_processors is None: + params.logits_processors = [] + params.logits_processors.append(guided_logits_processor) + return params + + # def _add_guided_processor(self, params: SamplingParams): + # if options := params.guided_options: + # if isinstance(options, dict): + # options = GuidedDecodingFields(**options) + # if options.guided_decoding_backend is None: + # decoding_config = self.llm_engine.get_decoding_config() + # options.guided_decoding_backend = ( + # decoding_config.guided_decoding_backend) + # guided_logits_processor = get_guided_decoding_logits_processor( + # options, self.get_tokenizer()) + # if guided_logits_processor: + # if params.logits_processors is None: + # params.logits_processors = [] + # params.logits_processors.append(guided_logits_processor) + # return params + def _add_request( self, inputs: PromptInputs, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 744e1d94511b3..ff022ae4fa44f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -25,7 +25,7 @@ from vllm.inputs import PromptInputs from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( - get_guided_decoding_logits_processor) + GuidedDecodingFields, get_guided_decoding_logits_processor_async, get_guided_decoding_logits_processor_async) from vllm.multimodal.image import ImagePixelData from vllm.multimodal.utils import (async_get_and_parse_image, get_full_image_text_prompt) @@ -247,18 +247,20 @@ async def create_chat_completion( add_special_tokens=request.add_special_tokens) sampling_params = request.to_sampling_params() lora_request = self._maybe_get_lora(request) - decoding_config = await self.engine.get_decoding_config() - guided_decoding_backend = request.guided_decoding_backend \ - or decoding_config.guided_decoding_backend - guided_decode_logits_processor = ( - await get_guided_decoding_logits_processor( - guided_decoding_backend, request, await - self.engine.get_tokenizer())) - if guided_decode_logits_processor: + + # request = adapt_request_for_tool_use(request) + # options = GuidedDecodingFields.from_openai_request(request) + if request.guided_decoding_backend is None: + decoding_config = await self.engine.get_decoding_config() + 
request.guided_decoding_backend = ( + decoding_config.guided_decoding_backend) + processors = (await get_guided_decoding_logits_processor_async( + request, await self.engine.get_tokenizer())) + if processors: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] - sampling_params.logits_processors.append( - guided_decode_logits_processor) + sampling_params.logits_processors.append(processors) + except ValueError as e: return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c775fa6daa739..410344f5c678c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -21,7 +21,7 @@ OpenAIServing) from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( - get_guided_decoding_logits_processor) + GuidedDecodingFields, get_guided_decoding_logits_processor_async, get_guided_decoding_logits_processor_async) from vllm.outputs import RequestOutput from vllm.sequence import Logprob from vllm.tracing import (contains_trace_headers, extract_trace_headers, @@ -98,18 +98,17 @@ async def create_completion(self, request: CompletionRequest, try: sampling_params = request.to_sampling_params() lora_request = self._maybe_get_lora(request) - decoding_config = await self.engine.get_decoding_config() - guided_decoding_backend = request.guided_decoding_backend \ - or decoding_config.guided_decoding_backend - guided_decode_logit_processor = ( - await get_guided_decoding_logits_processor( - guided_decoding_backend, request, await - self.engine.get_tokenizer())) - if guided_decode_logit_processor is not None: + if request.guided_decoding_backend is None: + decoding_config = await self.engine.get_decoding_config() + request.guided_decoding_backend = ( + decoding_config.guided_decoding_backend) + processors = (await get_guided_decoding_logits_processor_async( + request, await self.engine.get_tokenizer())) + if processors is not None: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] - sampling_params.logits_processors.append( - guided_decode_logit_processor) + sampling_params.logits_processors.append(processors) + prompt_is_tokens, prompts = parse_prompt_format(request.prompt) for i, prompt in enumerate(prompts): diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 50aa3ec379f4a..a8041f647a4d1 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -1,33 +1,95 @@ +import asyncio +import concurrent.futures from typing import Optional, Union from vllm.entrypoints.openai.protocol import ( ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, CompletionRequest) -from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( - get_lm_format_enforcer_guided_decoding_logits_processor) +from vllm.model_executor.guided_decoding.fields import GuidedDecodingFields from vllm.model_executor.guided_decoding.outlines_decoding import ( - get_outlines_guided_decoding_logits_processor) + get_outlines_guided_decoding_logits_processor, get_outlines_guided_decoding_logits_processor_async) from vllm.sampling_params import LogitsProcessor +global_thread_pool = None -async def get_guided_decoding_logits_processor( - guided_decoding_backend: str, request: Union[CompletionRequest, - ChatCompletionRequest], - tokenizer) -> Optional[LogitsProcessor]: - request = 
_adapt_request_for_tool_use(request) - if guided_decoding_backend == 'outlines': - return await get_outlines_guided_decoding_logits_processor( +async def get_guided_decoding_logits_processor_async( + request: Union[CompletionRequest, + ChatCompletionRequest], tokenizer) -> Optional[LogitsProcessor]: + global global_thread_pool + if global_thread_pool is None: + global_thread_pool = concurrent.futures.ThreadPoolExecutor( + max_workers=4) + loop = asyncio.get_running_loop() + + return await loop.run_in_executor( + global_thread_pool, + get_guided_decoding_logits_processor, + request, + tokenizer, + ) +# async def get_guided_decoding_logits_processor_async( +# guided_decoding_backend: str, request: Union[CompletionRequest, +# ChatCompletionRequest], +# tokenizer) -> Optional[LogitsProcessor]: +# request = _adapt_request_for_tool_use(request) + +# if guided_decoding_backend == 'outlines': +# return await get_outlines_guided_decoding_logits_processor_async( +# request, tokenizer) +# if guided_decoding_backend == 'lm-format-enforcer': +# from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( # noqa +# get_lm_format_enforcer_guided_decoding_logits_processor) +# options = GuidedDecodingFields.from_openai_request(request) +# return get_lm_format_enforcer_guided_decoding_logits_processor( +# options, tokenizer) + + # raise ValueError( + # f"Unknown guided decoding backend '{guided_decoding_backend}'. " + # "Must be one of 'outlines, 'lm-format-enforcer'") + + +# async def get_guided_decoding_logits_processor( +# guided_decoding_backend: str, request: Union[CompletionRequest, +# ChatCompletionRequest], +# tokenizer) -> Optional[LogitsProcessor]: +# request = _adapt_request_for_tool_use(request) + +# if guided_decoding_backend == 'outlines': +# return await get_outlines_guided_decoding_logits_processor( +# request, tokenizer) +# if guided_decoding_backend == 'lm-format-enforcer': +# return await get_lm_format_enforcer_guided_decoding_logits_processor( +# request, tokenizer) + +# raise ValueError( +# f"Unknown guided decoding backend '{guided_decoding_backend}'. " +# "Must be one of 'outlines, 'lm-format-enforcer'") + + +def get_guided_decoding_logits_processor( + request: Union[CompletionRequest, + ChatCompletionRequest, GuidedDecodingFields], tokenizer) -> Optional[LogitsProcessor]: + # request = _adapt_request_for_tool_use(request) + if request.guided_decoding_backend == 'outlines': + return get_outlines_guided_decoding_logits_processor( request, tokenizer) - if guided_decoding_backend == 'lm-format-enforcer': - return await get_lm_format_enforcer_guided_decoding_logits_processor( + if request.guided_decoding_backend == 'lm-format-enforcer': + ## Import moved inside function to avoide circular + ## import with vllm.entrypoints.LLM.py + from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import ( # noqa + get_lm_format_enforcer_guided_decoding_logits_processor) + return get_lm_format_enforcer_guided_decoding_logits_processor( request, tokenizer) raise ValueError( - f"Unknown guided decoding backend '{guided_decoding_backend}'. " + f"Unknown guided decoding backend '{request.guided_decoding_backend}'. 
" "Must be one of 'outlines, 'lm-format-enforcer'") +__all__ = ['get_guided_decoding_logits_processor', 'GuidedDecodingFields'] + + def _adapt_request_for_tool_use(request: Union[CompletionRequest, ChatCompletionRequest]): # the legacy completion API does not support tool use diff --git a/vllm/model_executor/guided_decoding/fields.py b/vllm/model_executor/guided_decoding/fields.py new file mode 100644 index 0000000000000..2c2f7e2f005d0 --- /dev/null +++ b/vllm/model_executor/guided_decoding/fields.py @@ -0,0 +1,48 @@ +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +from pydantic import BaseModel + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + CompletionRequest) + + +@dataclass +class GuidedDecodingFields: + """One of the fields will be used to retrieve the logit processor.""" + guided_json: Optional[Union[Dict, BaseModel, str]] = None + guided_regex: Optional[str] = None + guided_choice: Optional[List[str]] = None + guided_grammar: Optional[str] = None + guided_decoding_backend: Optional[str] = None + guided_whitespace_pattern: Optional[str] = None + guided_json_object: Optional[bool] = None + + def __post_init__(self): + """Validate that some fields are mutually exclusive.""" + guide_count = sum([ + self.guided_json is not None, + self.guided_regex is not None, + self.guided_choice is not None, + self.guided_grammar is not None, + self.guided_json_object is not None, + ]) + if guide_count > 1: + raise ValueError( + "You can only use one kind of guided decoding but multiple is " + f"specified: {self.__dict__}") + + @classmethod + def from_openai_request(cls, request: Union[CompletionRequest, + ChatCompletionRequest]): + is_json_object = (request.response_format is not None + and request.response_format.type == "json_object") + return cls( + guided_json=request.guided_json, + guided_regex=request.guided_regex, + guided_choice=request.guided_choice, + guided_grammar=request.guided_grammar, + guided_decoding_backend=request.guided_decoding_backend, + guided_whitespace_pattern=request.guided_whitespace_pattern or " ", + guided_json_object=is_json_object or None, + ) diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py index d0a5ca5592f9d..24173ddc050a6 100644 --- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py @@ -1,7 +1,9 @@ from functools import lru_cache from json import loads as json_loads from typing import Optional, Union - +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + CompletionRequest) from lmformatenforcer import (CharacterLevelParser, JsonSchemaParser, RegexParser, StringParser, TokenEnforcerTokenizerData, UnionParser) @@ -10,16 +12,15 @@ from pydantic import BaseModel from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - CompletionRequest) +from vllm.model_executor.guided_decoding.fields import GuidedDecodingFields from vllm.model_executor.guided_decoding.outlines_decoding import ( get_outlines_guided_decoding_logits_processor) from vllm.sampling_params import LogitsProcessor -async def get_lm_format_enforcer_guided_decoding_logits_processor( - request: Union[CompletionRequest, ChatCompletionRequest], - tokenizer) -> Optional[LogitsProcessor]: +def get_lm_format_enforcer_guided_decoding_logits_processor( + request: Union[CompletionRequest, 
+ ChatCompletionRequest, GuidedDecodingFields], tokenizer) -> Optional[LogitsProcessor]: """ Given an OpenAI-compatible request, check for guided decoding parameters and get the necessary logits processor for the given guide. @@ -40,12 +41,11 @@ async def get_lm_format_enforcer_guided_decoding_logits_processor( character_level_parser = RegexParser(request.guided_regex) elif request.guided_grammar: # CFG grammar not supported by LMFE, revert to outlines - return await get_outlines_guided_decoding_logits_processor( + return get_outlines_guided_decoding_logits_processor( request, tokenizer) - elif (request.response_format is not None - and request.response_format.type == "json_object"): - character_level_parser = JsonSchemaParser( - None) # None means any json object + elif isinstance(request, GuidedDecodingFields) and request.guided_json_object: + # None means any json object + character_level_parser = JsonSchemaParser(None) else: return None diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index 721f7e0530cb7..668064d3974da 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -1,18 +1,19 @@ -import asyncio -import concurrent.futures +from copy import copy from enum import Enum +from functools import lru_cache from json import dumps as json_dumps from re import escape as regex_escape -from typing import Tuple, Union +from typing import Optional, Tuple, Union from pydantic import BaseModel from transformers import PreTrainedTokenizerBase - from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, CompletionRequest) +from vllm.model_executor.guided_decoding.fields import GuidedDecodingFields from vllm.model_executor.guided_decoding.outlines_logits_processors import ( CFGLogitsProcessor, JSONLogitsProcessor, RegexLogitsProcessor) - +import concurrent.futures +import asyncio class GuidedDecodingMode(Enum): JSON = "json" @@ -51,7 +52,7 @@ class GuidedDecodingMode(Enum): global_thread_pool = None # used for generating logits processor fsm -async def get_outlines_guided_decoding_logits_processor( +async def get_outlines_guided_decoding_logits_processor_async( request: Union[CompletionRequest, ChatCompletionRequest], tokenizer: PreTrainedTokenizerBase ) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor, @@ -69,16 +70,37 @@ async def get_outlines_guided_decoding_logits_processor( if global_thread_pool is None: global_thread_pool = concurrent.futures.ThreadPoolExecutor( - max_workers=2) + max_workers=4) loop = asyncio.get_running_loop() return await loop.run_in_executor(global_thread_pool, - _get_logits_processor, guide, tokenizer, + _get_cached_logits_processor, guide, tokenizer, mode, request.guided_whitespace_pattern) +def get_outlines_guided_decoding_logits_processor( + request: Union[CompletionRequest, + ChatCompletionRequest, GuidedDecodingFields], tokenizer +) -> Optional[Union[JSONLogitsProcessor, RegexLogitsProcessor]]: + """ + Given an OpenAI-compatible request, check for guided decoding parameters + and get the necessary logits processor for the given guide. + We cache logit processors by (guide, tokenizer), and on cache hit + we make a shallow copy to reuse the same underlying FSM. 
+    """
+    guide, mode = _get_guide_and_mode(request)
+    if not guide or not mode:
+        return None
+
+    logits_processor = copy(
+        _get_cached_logits_processor(guide, tokenizer, mode,
+                                     request.guided_whitespace_pattern))
+    return logits_processor
+
+
 def _get_guide_and_mode(
-    request: Union[CompletionRequest, ChatCompletionRequest]
+    request: Union[CompletionRequest,
+                   ChatCompletionRequest, GuidedDecodingFields]
 ) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]:

     if request.guided_json:
@@ -102,17 +124,20 @@ def _get_guide_and_mode(
         return choices_regex, GuidedDecodingMode.CHOICE
     elif request.guided_grammar:
         return request.guided_grammar, GuidedDecodingMode.GRAMMAR
-    elif (request.response_format is not None
+    elif (not isinstance(request, GuidedDecodingFields)
+          and request.response_format is not None
           and request.response_format.type == "json_object"):
         return JSON_GRAMMAR, GuidedDecodingMode.GRAMMAR
     else:
         return None, None


-def _get_logits_processor(
-    guide: str, tokenizer: PreTrainedTokenizerBase, mode: GuidedDecodingMode,
-    whitespace_pattern: Union[str, None]
-) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor]:
+
+@lru_cache(maxsize=32)
+def _get_cached_logits_processor(guide: str,
+                                 tokenizer: PreTrainedTokenizerBase,
+                                 mode: GuidedDecodingMode,
+                                 whitespace_pattern: Union[str, None]):
     if mode == GuidedDecodingMode.JSON:
         return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern)
     elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE:
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 9d8a361353e26..81b336297c395 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -104,6 +104,8 @@ class SamplingParams:
         truncate_prompt_tokens: If set to an integer k, will use only the last
             k tokens from the prompt (i.e., left truncation). Defaults to None
             (i.e., no truncation).
+        guided_options: Configuration dictionary for guided decoding. Refer to
+            the `GuidedDecodingFields` class for the available options.
     """

     def __init__(
@@ -134,6 +136,7 @@ def __init__(
         spaces_between_special_tokens: bool = True,
         logits_processors: Optional[List[LogitsProcessor]] = None,
         truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
+        # guided_options: Optional[Union[Dict, "GuidedDecodingFields"]] = None
     ) -> None:
         self.n = n
         self.best_of = best_of if best_of is not None else n
@@ -182,6 +185,8 @@ def __init__(
         else:
             self.output_text_buffer_length = 0

+        # self.guided_options = guided_options
+
         self._verify_args()
         if self.use_beam_search:
             self._verify_beam_search()
@@ -342,4 +347,4 @@ def __repr__(self) -> str:
             f"skip_special_tokens={self.skip_special_tokens}, "
             "spaces_between_special_tokens="
             f"{self.spaces_between_special_tokens}, "
-            f"truncate_prompt_tokens={self.truncate_prompt_tokens})")
+            f"truncate_prompt_tokens={self.truncate_prompt_tokens})")
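
Usage note (editor's addition, not part of the patch): the diff above adds a guided_options argument to LLM.generate() and exposes GuidedDecodingFields for the offline path. Below is a minimal sketch of how that API would be exercised, assuming the import paths shown in the patch; the model name, regex, and sampling values are illustrative assumptions only.

from vllm.entrypoints.llm import LLM
from vllm.model_executor.guided_decoding import GuidedDecodingFields
from vllm.sampling_params import SamplingParams

# Any small causal LM works here; opt-125m is just a convenient assumption.
llm = LLM(model="facebook/opt-125m")

# Exactly one guided_* constraint may be set; GuidedDecodingFields raises
# ValueError in __post_init__ if more than one is supplied.
options = GuidedDecodingFields(
    guided_regex=r"\d{4}-\d{2}-\d{2}",
    guided_decoding_backend="outlines",
)

outputs = llm.generate(
    "Today's date in ISO format is ",
    sampling_params=SamplingParams(temperature=0.0, max_tokens=16),
    guided_options=options,
)
print(outputs[0].outputs[0].text)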
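
Design note (editor's addition): the outlines path now builds logits processors behind functools.lru_cache and hands each caller copy(...) of the cached instance, so the expensive guide/FSM compilation is shared across requests while every request gets its own object for mutable decoding state. A generic, hypothetical sketch of that copy-on-cache-hit pattern follows; the class and function names are illustrative, not vLLM's.

from copy import copy
from functools import lru_cache


class GuideProcessor:
    """Stand-in for an FSM-backed logits processor."""

    def __init__(self, guide: str):
        self.guide = guide      # expensive to build, safe to share
        self.state = 0          # cheap, must be per-request


@lru_cache(maxsize=32)
def _build_processor(guide: str) -> GuideProcessor:
    # Compiled once per distinct guide string.
    return GuideProcessor(guide)


def get_processor(guide: str) -> GuideProcessor:
    # Shallow copy on every call: the compiled guide is reused by reference,
    # but rebinding `state` on the copy never touches the cached original.
    return copy(_build_processor(guide))

A shallow copy is sufficient only while per-request attributes are rebound rather than mutated in place; anything mutated in place would still be shared with the cached instance.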